summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/Kconfig.debug2
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/damon/core.c39
-rw-r--r--mm/damon/ops-common.c38
-rw-r--r--mm/damon/ops-common.h2
-rw-r--r--mm/damon/paddr.c163
-rw-r--r--mm/damon/reclaim.c19
-rw-r--r--mm/damon/sysfs-schemes.c373
-rw-r--r--mm/damon/vaddr.c68
-rw-r--r--mm/debug_vm_pgtable.c102
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/hmm.c15
-rw-r--r--mm/huge_memory.c64
-rw-r--r--mm/hugetlb.c114
-rw-r--r--mm/internal.h19
-rw-r--r--mm/kasan/common.c18
-rw-r--r--mm/kasan/hw_tags.c60
-rw-r--r--mm/kasan/kasan.h27
-rw-r--r--mm/madvise.c100
-rw-r--r--mm/memcontrol.c78
-rw-r--r--mm/memfd.c56
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c55
-rw-r--r--mm/mempolicy.c35
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mprotect.c105
-rw-r--r--mm/nommu.c62
-rw-r--r--mm/page-writeback.c59
-rw-r--r--mm/page_alloc.c60
-rw-r--r--mm/page_idle.c47
-rw-r--r--mm/page_io.c1
-rw-r--r--mm/page_reporting.c6
-rw-r--r--mm/page_vma_mapped.c9
-rw-r--r--mm/pagewalk.c6
-rw-r--r--mm/rmap.c68
-rw-r--r--mm/shmem.c17
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slab.h8
-rw-r--r--mm/slab_common.c1
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap.c26
-rw-r--r--mm/swap_state.c59
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/userfaultfd.c49
-rw-r--r--mm/util.c24
-rw-r--r--mm/vmalloc.c85
-rw-r--r--mm/vmscan.c738
-rw-r--r--mm/workingset.c7
-rw-r--r--mm/z3fold.c2
-rw-r--r--mm/zsmalloc.c3
52 files changed, 2005 insertions, 932 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ff7b209dec05..39df30dcabe3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1073,7 +1073,7 @@ config GUP_TEST
pin_user_pages*(), or pinned via get_user_pages*(), as specified
by other command line arguments.
- See tools/testing/selftests/vm/gup_test.c
+ See tools/testing/selftests/mm/gup_test.c
comment "GUP_TEST needs to have DEBUG_FS enabled"
depends on !GUP_TEST && !DEBUG_FS
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index fca699ad1fb0..d62f48131952 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -90,7 +90,7 @@ config PAGE_OWNER
help to find bare alloc_page(s) leaks. Even if you include this
feature on your build, it is disabled in default. You should pass
"page_owner=on" to boot parameter in order to enable it. Eats
- a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ a fair amount of memory if enabled. See tools/mm/page_owner_sort.c
for user-space helper.
If unsure, say N.
diff --git a/mm/cma.c b/mm/cma.c
index 4a978e09547a..a75b17b03b66 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -491,7 +491,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
start = bitmap_no + mask + 1;
}
- trace_cma_alloc_finish(cma->name, pfn, page, count, align);
+ trace_cma_alloc_finish(cma->name, pfn, page, count, align, ret);
/*
* CMA can allocate multiple page blocks, which results in different
diff --git a/mm/compaction.c b/mm/compaction.c
index ca1603524bbe..62a61de44658 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -122,7 +122,6 @@ bool PageMovable(struct page *page)
return false;
}
-EXPORT_SYMBOL(PageMovable);
void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ceec75b88ef9..1bf0654ae189 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -263,6 +263,40 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
return 0;
}
+struct damos_filter *damos_new_filter(enum damos_filter_type type,
+ bool matching)
+{
+ struct damos_filter *filter;
+
+ filter = kmalloc(sizeof(*filter), GFP_KERNEL);
+ if (!filter)
+ return NULL;
+ filter->type = type;
+ filter->matching = matching;
+ return filter;
+}
+
+void damos_add_filter(struct damos *s, struct damos_filter *f)
+{
+ list_add_tail(&f->list, &s->filters);
+}
+
+static void damos_del_filter(struct damos_filter *f)
+{
+ list_del(&f->list);
+}
+
+static void damos_free_filter(struct damos_filter *f)
+{
+ kfree(f);
+}
+
+void damos_destroy_filter(struct damos_filter *f)
+{
+ damos_del_filter(f);
+ damos_free_filter(f);
+}
+
/* initialize private fields of damos_quota and return the pointer */
static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
{
@@ -287,6 +321,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
return NULL;
scheme->pattern = *pattern;
scheme->action = action;
+ INIT_LIST_HEAD(&scheme->filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -315,6 +350,10 @@ static void damon_free_scheme(struct damos *s)
void damon_destroy_scheme(struct damos *s)
{
+ struct damos_filter *f, *next;
+
+ damos_for_each_filter_safe(f, next, s)
+ damos_destroy_filter(f);
damon_del_scheme(s);
damon_free_scheme(s);
}
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 75409601f934..cc63cf953636 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -16,29 +16,33 @@
* Get an online page for a pfn if it's in the LRU list. Otherwise, returns
* NULL.
*
- * The body of this function is stolen from the 'page_idle_get_page()'. We
+ * The body of this function is stolen from the 'page_idle_get_folio()'. We
* steal rather than reuse it because the code is quite simple.
*/
-struct page *damon_get_page(unsigned long pfn)
+struct folio *damon_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
{
bool referenced = false;
- struct page *page = damon_get_page(pte_pfn(*pte));
+ struct folio *folio = damon_get_folio(pte_pfn(*pte));
- if (!page)
+ if (!folio)
return;
if (pte_young(*pte)) {
@@ -52,19 +56,19 @@ void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool referenced = false;
- struct page *page = damon_get_page(pmd_pfn(*pmd));
+ struct folio *folio = damon_get_folio(pmd_pfn(*pmd));
- if (!page)
+ if (!folio)
return;
if (pmd_young(*pmd)) {
@@ -78,10 +82,10 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 8d82d3722204..14f4bc69f29b 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -7,7 +7,7 @@
#include <linux/damon.h>
-struct page *damon_get_page(unsigned long pfn);
+struct folio *damon_get_folio(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index e1a4315c4be6..b4df9b9bcc0a 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -33,17 +33,15 @@ static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
static void damon_pa_mkold(unsigned long paddr)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
struct rmap_walk_control rwc = {
.rmap_one = __damon_pa_mkold,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
folio_set_idle(folio);
@@ -81,69 +79,57 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
}
}
-struct damon_pa_access_chk_result {
- unsigned long page_sz;
- bool accessed;
-};
-
static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct damon_pa_access_chk_result *result = arg;
+ bool *accessed = arg;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
- result->accessed = false;
- result->page_sz = PAGE_SIZE;
+ *accessed = false;
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
if (pvmw.pte) {
- result->accessed = pte_young(*pvmw.pte) ||
+ *accessed = pte_young(*pvmw.pte) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- result->accessed = pmd_young(*pvmw.pmd) ||
+ *accessed = pmd_young(*pvmw.pmd) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
- result->page_sz = HPAGE_PMD_SIZE;
#else
WARN_ON_ONCE(1);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
- if (result->accessed) {
+ if (*accessed) {
page_vma_mapped_walk_done(&pvmw);
break;
}
}
/* If accessed, stop walking */
- return !result->accessed;
+ return *accessed == false;
}
-static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
+static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
- struct folio *folio;
- struct page *page = damon_get_page(PHYS_PFN(paddr));
- struct damon_pa_access_chk_result result = {
- .page_sz = PAGE_SIZE,
- .accessed = false,
- };
+ struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
+ bool accessed = false;
struct rmap_walk_control rwc = {
- .arg = &result,
+ .arg = &accessed,
.rmap_one = __damon_pa_young,
.anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page)
+ if (!folio)
return false;
- folio = page_folio(page);
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
if (folio_test_idle(folio))
- result.accessed = false;
+ accessed = false;
else
- result.accessed = true;
+ accessed = true;
folio_put(folio);
goto out;
}
@@ -161,25 +147,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
folio_put(folio);
out:
- *page_sz = result.page_sz;
- return result.accessed;
+ *folio_sz = folio_size(folio);
+ return accessed;
}
static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+ if (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz)) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+ last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
@@ -202,63 +188,116 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned long damon_pa_pageout(struct damon_region *r)
+static bool __damos_pa_filter_out(struct damos_filter *filter,
+ struct folio *folio)
+{
+ bool matched = false;
+ struct mem_cgroup *memcg;
+
+ switch (filter->type) {
+ case DAMOS_FILTER_TYPE_ANON:
+ matched = folio_test_anon(folio);
+ break;
+ case DAMOS_FILTER_TYPE_MEMCG:
+ rcu_read_lock();
+ memcg = folio_memcg_check(folio);
+ if (!memcg)
+ matched = false;
+ else
+ matched = filter->memcg_id == mem_cgroup_id(memcg);
+ rcu_read_unlock();
+ break;
+ default:
+ break;
+ }
+
+ return matched == filter->matching;
+}
+
+/*
+ * damos_pa_filter_out - Return true if the page should be filtered out.
+ */
+static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio)
+{
+ struct damos_filter *filter;
+
+ damos_for_each_filter(filter, scheme) {
+ if (__damos_pa_filter_out(filter, folio))
+ return true;
+ }
+ return false;
+}
+
+static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
{
unsigned long addr, applied;
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
- if (!page)
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
+ }
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
- if (isolate_lru_page(page)) {
- put_page(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
+ if (folio_isolate_lru(folio)) {
+ folio_put(folio);
continue;
}
- if (PageUnevictable(page)) {
- putback_lru_page(page);
+ if (folio_test_unevictable(folio)) {
+ folio_putback_lru(folio);
} else {
- list_add(&page->lru, &page_list);
- put_page(page);
+ list_add(&folio->lru, &folio_list);
+ folio_put(folio);
}
}
- applied = reclaim_pages(&page_list);
+ applied = reclaim_pages(&folio_list);
cond_resched();
return applied * PAGE_SIZE;
}
static inline unsigned long damon_pa_mark_accessed_or_deactivate(
- struct damon_region *r, bool mark_accessed)
+ struct damon_region *r, struct damos *s, bool mark_accessed)
{
unsigned long addr, applied = 0;
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ struct folio *folio = damon_get_folio(PHYS_PFN(addr));
+
+ if (!folio)
+ continue;
- if (!page)
+ if (damos_pa_filter_out(s, folio)) {
+ folio_put(folio);
continue;
+ }
+
if (mark_accessed)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else
- deactivate_page(page);
- put_page(page);
- applied++;
+ folio_deactivate(folio);
+ folio_put(folio);
+ applied += folio_nr_pages(folio);
}
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+static unsigned long damon_pa_mark_accessed(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, true);
+ return damon_pa_mark_accessed_or_deactivate(r, s, true);
}
-static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r,
+ struct damos *s)
{
- return damon_pa_mark_accessed_or_deactivate(r, false);
+ return damon_pa_mark_accessed_or_deactivate(r, s, false);
}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
@@ -267,11 +306,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pa_pageout(r);
+ return damon_pa_pageout(r, scheme);
case DAMOS_LRU_PRIO:
- return damon_pa_mark_accessed(r);
+ return damon_pa_mark_accessed(r, scheme);
case DAMOS_LRU_DEPRIO:
- return damon_pa_deactivate_pages(r);
+ return damon_pa_deactivate_pages(r, scheme);
case DAMOS_STAT:
break;
default:
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e82631f39481..648d2a85523a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -99,6 +99,15 @@ static unsigned long monitor_region_end __read_mostly;
module_param(monitor_region_end, ulong, 0600);
/*
+ * Skip anonymous pages reclamation.
+ *
+ * If this parameter is set as ``Y``, DAMON_RECLAIM does not reclaim anonymous
+ * pages. By default, ``N``.
+ */
+static bool skip_anon __read_mostly;
+module_param(skip_anon, bool, 0600);
+
+/*
* PID of the DAMON thread
*
* If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
@@ -142,6 +151,7 @@ static struct damos *damon_reclaim_new_scheme(void)
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme;
+ struct damos_filter *filter;
int err = 0;
err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
@@ -152,6 +162,15 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (skip_anon) {
+ filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
+ if (!filter) {
+ /* Will be freed by next 'damon_set_schemes()' below */
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damos_add_filter(scheme, filter);
+ }
damon_set_schemes(ctx, &scheme, 1);
return damon_set_region_biggest_system_ram_default(target,
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 81fc4d27f4e4..86edca66aab1 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -259,6 +259,257 @@ static struct kobj_type damon_sysfs_stats_ktype = {
};
/*
+ * filter directory
+ */
+
+struct damon_sysfs_scheme_filter {
+ struct kobject kobj;
+ enum damos_filter_type type;
+ bool matching;
+ char *memcg_path;
+};
+
+static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filter), GFP_KERNEL);
+}
+
+/* Should match with enum damos_filter_type */
+static const char * const damon_sysfs_scheme_filter_type_strs[] = {
+ "anon",
+ "memcg",
+};
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_scheme_filter_type_strs[filter->type]);
+}
+
+static ssize_t type_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ enum damos_filter_type type;
+ ssize_t ret = -EINVAL;
+
+ for (type = 0; type < NR_DAMOS_FILTER_TYPES; type++) {
+ if (sysfs_streq(buf, damon_sysfs_scheme_filter_type_strs[
+ type])) {
+ filter->type = type;
+ ret = count;
+ break;
+ }
+ }
+ return ret;
+}
+
+static ssize_t matching_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%c\n", filter->matching ? 'Y' : 'N');
+}
+
+static ssize_t matching_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ bool matching;
+ int err = kstrtobool(buf, &matching);
+
+ if (err)
+ return err;
+
+ filter->matching = matching;
+ return count;
+}
+
+static ssize_t memcg_path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ filter->memcg_path ? filter->memcg_path : "");
+}
+
+static ssize_t memcg_path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+ char *path = kmalloc(sizeof(*path) * (count + 1), GFP_KERNEL);
+
+ if (!path)
+ return -ENOMEM;
+
+ strscpy(path, buf, count + 1);
+ filter->memcg_path = path;
+ return count;
+}
+
+static void damon_sysfs_scheme_filter_release(struct kobject *kobj)
+{
+ struct damon_sysfs_scheme_filter *filter = container_of(kobj,
+ struct damon_sysfs_scheme_filter, kobj);
+
+ kfree(filter->memcg_path);
+ kfree(filter);
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filter_type_attr =
+ __ATTR_RW_MODE(type, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr =
+ __ATTR_RW_MODE(matching, 0600);
+
+static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr =
+ __ATTR_RW_MODE(memcg_path, 0600);
+
+static struct attribute *damon_sysfs_scheme_filter_attrs[] = {
+ &damon_sysfs_scheme_filter_type_attr.attr,
+ &damon_sysfs_scheme_filter_matching_attr.attr,
+ &damon_sysfs_scheme_filter_memcg_path_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter);
+
+static struct kobj_type damon_sysfs_scheme_filter_ktype = {
+ .release = damon_sysfs_scheme_filter_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filter_groups,
+};
+
+/*
+ * filters directory
+ */
+
+struct damon_sysfs_scheme_filters {
+ struct kobject kobj;
+ struct damon_sysfs_scheme_filter **filters_arr;
+ int nr;
+};
+
+static struct damon_sysfs_scheme_filters *
+damon_sysfs_scheme_filters_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_scheme_filters), GFP_KERNEL);
+}
+
+static void damon_sysfs_scheme_filters_rm_dirs(
+ struct damon_sysfs_scheme_filters *filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr = filters->filters_arr;
+ int i;
+
+ for (i = 0; i < filters->nr; i++)
+ kobject_put(&filters_arr[i]->kobj);
+ filters->nr = 0;
+ kfree(filters_arr);
+ filters->filters_arr = NULL;
+}
+
+static int damon_sysfs_scheme_filters_add_dirs(
+ struct damon_sysfs_scheme_filters *filters, int nr_filters)
+{
+ struct damon_sysfs_scheme_filter **filters_arr, *filter;
+ int err, i;
+
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ if (!nr_filters)
+ return 0;
+
+ filters_arr = kmalloc_array(nr_filters, sizeof(*filters_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!filters_arr)
+ return -ENOMEM;
+ filters->filters_arr = filters_arr;
+
+ for (i = 0; i < nr_filters; i++) {
+ filter = damon_sysfs_scheme_filter_alloc();
+ if (!filter) {
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&filter->kobj,
+ &damon_sysfs_scheme_filter_ktype,
+ &filters->kobj, "%d", i);
+ if (err) {
+ kobject_put(&filter->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(filters);
+ return err;
+ }
+
+ filters_arr[i] = filter;
+ filters->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_filters_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_scheme_filters *filters = container_of(kobj,
+ struct damon_sysfs_scheme_filters, kobj);
+
+ return sysfs_emit(buf, "%d\n", filters->nr);
+}
+
+static ssize_t nr_filters_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme_filters *filters;
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ filters = container_of(kobj, struct damon_sysfs_scheme_filters, kobj);
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_scheme_filters_add_dirs(filters, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_scheme_filters_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme_filters, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_filters_nr_attr =
+ __ATTR_RW_MODE(nr_filters, 0600);
+
+static struct attribute *damon_sysfs_scheme_filters_attrs[] = {
+ &damon_sysfs_scheme_filters_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme_filters);
+
+static struct kobj_type damon_sysfs_scheme_filters_ktype = {
+ .release = damon_sysfs_scheme_filters_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_filters_groups,
+};
+
+/*
* watermarks directory
*/
@@ -784,6 +1035,7 @@ struct damon_sysfs_scheme {
struct damon_sysfs_access_pattern *access_pattern;
struct damon_sysfs_quotas *quotas;
struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_scheme_filters *filters;
struct damon_sysfs_stats *stats;
struct damon_sysfs_scheme_regions *tried_regions;
};
@@ -878,6 +1130,24 @@ static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
return err;
}
+static int damon_sysfs_scheme_set_filters(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_scheme_filters *filters =
+ damon_sysfs_scheme_filters_alloc();
+ int err;
+
+ if (!filters)
+ return -ENOMEM;
+ err = kobject_init_and_add(&filters->kobj,
+ &damon_sysfs_scheme_filters_ktype, &scheme->kobj,
+ "filters");
+ if (err)
+ kobject_put(&filters->kobj);
+ else
+ scheme->filters = filters;
+ return err;
+}
+
static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
{
struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
@@ -926,9 +1196,12 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
goto put_quotas_access_pattern_out;
- err = damon_sysfs_scheme_set_stats(scheme);
+ err = damon_sysfs_scheme_set_filters(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_filters_watermarks_quotas_access_pattern_out;
err = damon_sysfs_scheme_set_tried_regions(scheme);
if (err)
goto put_tried_regions_out;
@@ -937,6 +1210,9 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
put_tried_regions_out:
kobject_put(&scheme->tried_regions->kobj);
scheme->tried_regions = NULL;
+put_filters_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->filters->kobj);
+ scheme->filters = NULL;
put_watermarks_quotas_access_pattern_out:
kobject_put(&scheme->watermarks->kobj);
scheme->watermarks = NULL;
@@ -956,6 +1232,8 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
kobject_put(&scheme->watermarks->kobj);
+ damon_sysfs_scheme_filters_rm_dirs(scheme->filters);
+ kobject_put(&scheme->filters->kobj);
kobject_put(&scheme->stats->kobj);
damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions);
kobject_put(&scheme->tried_regions->kobj);
@@ -1124,6 +1402,79 @@ struct kobj_type damon_sysfs_schemes_ktype = {
.default_groups = damon_sysfs_schemes_groups,
};
+static bool damon_sysfs_memcg_path_eq(struct mem_cgroup *memcg,
+ char *memcg_path_buf, char *path)
+{
+#ifdef CONFIG_MEMCG
+ cgroup_path(memcg->css.cgroup, memcg_path_buf, PATH_MAX);
+ if (sysfs_streq(memcg_path_buf, path))
+ return true;
+#endif /* CONFIG_MEMCG */
+ return false;
+}
+
+static int damon_sysfs_memcg_path_to_id(char *memcg_path, unsigned short *id)
+{
+ struct mem_cgroup *memcg;
+ char *path;
+ bool found = false;
+
+ if (!memcg_path)
+ return -EINVAL;
+
+ path = kmalloc(sizeof(*path) * PATH_MAX, GFP_KERNEL);
+ if (!path)
+ return -ENOMEM;
+
+ for (memcg = mem_cgroup_iter(NULL, NULL, NULL); memcg;
+ memcg = mem_cgroup_iter(NULL, memcg, NULL)) {
+ /* skip removed memcg */
+ if (!mem_cgroup_id(memcg))
+ continue;
+ if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) {
+ *id = mem_cgroup_id(memcg);
+ found = true;
+ break;
+ }
+ }
+
+ kfree(path);
+ return found ? 0 : -EINVAL;
+}
+
+static int damon_sysfs_set_scheme_filters(struct damos *scheme,
+ struct damon_sysfs_scheme_filters *sysfs_filters)
+{
+ int i;
+ struct damos_filter *filter, *next;
+
+ damos_for_each_filter_safe(filter, next, scheme)
+ damos_destroy_filter(filter);
+
+ for (i = 0; i < sysfs_filters->nr; i++) {
+ struct damon_sysfs_scheme_filter *sysfs_filter =
+ sysfs_filters->filters_arr[i];
+ struct damos_filter *filter =
+ damos_new_filter(sysfs_filter->type,
+ sysfs_filter->matching);
+ int err;
+
+ if (!filter)
+ return -ENOMEM;
+ if (filter->type == DAMOS_FILTER_TYPE_MEMCG) {
+ err = damon_sysfs_memcg_path_to_id(
+ sysfs_filter->memcg_path,
+ &filter->memcg_id);
+ if (err) {
+ damos_destroy_filter(filter);
+ return err;
+ }
+ }
+ damos_add_filter(scheme, filter);
+ }
+ return 0;
+}
+
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
@@ -1132,6 +1483,10 @@ static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ struct damon_sysfs_scheme_filters *sysfs_filters =
+ sysfs_scheme->filters;
+ struct damos *scheme;
+ int err;
struct damos_access_pattern pattern = {
.min_sz_region = access_pattern->sz->min,
@@ -1157,8 +1512,17 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ scheme = damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
&wmarks);
+ if (!scheme)
+ return NULL;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_filters);
+ if (err) {
+ damon_destroy_scheme(scheme);
+ return NULL;
+ }
+ return scheme;
}
static void damon_sysfs_update_scheme(struct damos *scheme,
@@ -1169,6 +1533,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ int err;
scheme->pattern.min_sz_region = access_pattern->sz->min;
scheme->pattern.max_sz_region = access_pattern->sz->max;
@@ -1191,6 +1556,10 @@ static void damon_sysfs_update_scheme(struct damos *scheme,
scheme->wmarks.high = sysfs_wmarks->high;
scheme->wmarks.mid = sysfs_wmarks->mid;
scheme->wmarks.low = sysfs_wmarks->low;
+
+ err = damon_sysfs_set_scheme_filters(scheme, sysfs_scheme->filters);
+ if (err)
+ damon_destroy_scheme(scheme);
}
int damon_sysfs_set_schemes(struct damon_ctx *ctx,
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 15f03df66db6..1fec16d7263e 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -335,9 +335,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
{
bool referenced = false;
pte_t entry = huge_ptep_get(pte);
- struct page *page = pte_page(entry);
+ struct folio *folio = pfn_folio(pte_pfn(entry));
- get_page(page);
+ folio_get(folio);
if (pte_young(entry)) {
referenced = true;
@@ -352,10 +352,10 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
- set_page_young(page);
+ folio_set_young(folio);
- set_page_idle(page);
- put_page(page);
+ folio_set_idle(folio);
+ folio_put(folio);
}
static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
@@ -422,7 +422,8 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
}
struct damon_young_walk_private {
- unsigned long *page_sz;
+ /* size of the folio for the access checked virtual memory address */
+ unsigned long *folio_sz;
bool young;
};
@@ -431,7 +432,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
{
pte_t *pte;
spinlock_t *ptl;
- struct page *page;
+ struct folio *folio;
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -446,16 +447,15 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
spin_unlock(ptl);
goto regular_page;
}
- page = damon_get_page(pmd_pfn(*pmd));
- if (!page)
+ folio = damon_get_folio(pmd_pfn(*pmd));
+ if (!folio)
goto huge_out;
- if (pmd_young(*pmd) || !page_is_idle(page) ||
+ if (pmd_young(*pmd) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm,
- addr)) {
- *priv->page_sz = HPAGE_PMD_SIZE;
+ addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = HPAGE_PMD_SIZE;
+ folio_put(folio);
huge_out:
spin_unlock(ptl);
return 0;
@@ -469,15 +469,14 @@ regular_page:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (!pte_present(*pte))
goto out;
- page = damon_get_page(pte_pfn(*pte));
- if (!page)
+ folio = damon_get_folio(pte_pfn(*pte));
+ if (!folio)
goto out;
- if (pte_young(*pte) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = PAGE_SIZE;
+ if (pte_young(*pte) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
- put_page(page);
+ *priv->folio_sz = folio_size(folio);
+ folio_put(folio);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -490,7 +489,7 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
{
struct damon_young_walk_private *priv = walk->private;
struct hstate *h = hstate_vma(walk->vma);
- struct page *page;
+ struct folio *folio;
spinlock_t *ptl;
pte_t entry;
@@ -499,16 +498,15 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
if (!pte_present(entry))
goto out;
- page = pte_page(entry);
- get_page(page);
+ folio = pfn_folio(pte_pfn(entry));
+ folio_get(folio);
- if (pte_young(entry) || !page_is_idle(page) ||
- mmu_notifier_test_young(walk->mm, addr)) {
- *priv->page_sz = huge_page_size(h);
+ if (pte_young(entry) || !folio_test_idle(folio) ||
+ mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
- }
+ *priv->folio_sz = huge_page_size(h);
- put_page(page);
+ folio_put(folio);
out:
spin_unlock(ptl);
@@ -524,10 +522,10 @@ static const struct mm_walk_ops damon_young_ops = {
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
- unsigned long *page_sz)
+ unsigned long *folio_sz)
{
struct damon_young_walk_private arg = {
- .page_sz = page_sz,
+ .folio_sz = folio_sz,
.young = false,
};
@@ -547,18 +545,18 @@ static void __damon_va_check_access(struct mm_struct *mm,
struct damon_region *r, bool same_target)
{
static unsigned long last_addr;
- static unsigned long last_page_sz = PAGE_SIZE;
+ static unsigned long last_folio_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
- ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
+ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_folio_sz))) {
if (last_accessed)
r->nr_accesses++;
return;
}
- last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz);
+ last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz);
if (last_accessed)
r->nr_accesses++;
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c631ade3f1d2..bb3328f46126 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -15,6 +15,7 @@
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/kconfig.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/mm_types.h>
@@ -80,6 +81,7 @@ struct pgtable_debug_args {
unsigned long pmd_pfn;
unsigned long pte_pfn;
+ unsigned long fixed_alignment;
unsigned long fixed_pgd_pfn;
unsigned long fixed_p4d_pfn;
unsigned long fixed_pud_pfn;
@@ -430,7 +432,8 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!arch_vmap_pmd_supported(args->page_prot))
+ if (!arch_vmap_pmd_supported(args->page_prot) ||
+ args->fixed_alignment < PMD_SIZE)
return;
pr_debug("Validating PMD huge\n");
@@ -449,7 +452,8 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
{
pud_t pud;
- if (!arch_vmap_pud_supported(args->page_prot))
+ if (!arch_vmap_pud_supported(args->page_prot) ||
+ args->fixed_alignment < PUD_SIZE)
return;
pr_debug("Validating PUD huge\n");
@@ -1077,10 +1081,85 @@ debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order)
return page;
}
+/*
+ * Check if a physical memory range described by <pstart, pend> contains
+ * an area that is of size psize, and aligned to psize.
+ *
+ * Don't use address 0, an all-zeroes physical address might mask bugs, and
+ * it's not used on x86.
+ */
+static void __init phys_align_check(phys_addr_t pstart,
+ phys_addr_t pend, unsigned long psize,
+ phys_addr_t *physp, unsigned long *alignp)
+{
+ phys_addr_t aligned_start, aligned_end;
+
+ if (pstart == 0)
+ pstart = PAGE_SIZE;
+
+ aligned_start = ALIGN(pstart, psize);
+ aligned_end = aligned_start + psize;
+
+ if (aligned_end > aligned_start && aligned_end <= pend) {
+ *alignp = psize;
+ *physp = aligned_start;
+ }
+}
+
+static void __init init_fixed_pfns(struct pgtable_debug_args *args)
+{
+ u64 idx;
+ phys_addr_t phys, pstart, pend;
+
+ /*
+ * Initialize the fixed pfns. To do this, try to find a
+ * valid physical range, preferably aligned to PUD_SIZE,
+ * but settling for aligned to PMD_SIZE as a fallback. If
+ * neither of those is found, use the physical address of
+ * the start_kernel symbol.
+ *
+ * The memory doesn't need to be allocated, it just needs to exist
+ * as usable memory. It won't be touched.
+ *
+ * The alignment is recorded, and can be checked to see if we
+ * can run the tests that require an actual valid physical
+ * address range on some architectures ({pmd,pud}_huge_test
+ * on x86).
+ */
+
+ phys = __pa_symbol(&start_kernel);
+ args->fixed_alignment = PAGE_SIZE;
+
+ for_each_mem_range(idx, &pstart, &pend) {
+ /* First check for a PUD-aligned area */
+ phys_align_check(pstart, pend, PUD_SIZE, &phys,
+ &args->fixed_alignment);
+
+ /* If a PUD-aligned area is found, we're done */
+ if (args->fixed_alignment == PUD_SIZE)
+ break;
+
+ /*
+ * If no PMD-aligned area found yet, check for one,
+ * but continue the loop to look for a PUD-aligned area.
+ */
+ if (args->fixed_alignment < PMD_SIZE)
+ phys_align_check(pstart, pend, PMD_SIZE, &phys,
+ &args->fixed_alignment);
+ }
+
+ args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
+ args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
+ args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
+ args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
+ args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
+ WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+}
+
+
static int __init init_args(struct pgtable_debug_args *args)
{
struct page *page = NULL;
- phys_addr_t phys;
int ret = 0;
/*
@@ -1160,22 +1239,7 @@ static int __init init_args(struct pgtable_debug_args *args)
args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
WARN_ON(!args->start_ptep);
- /*
- * PFN for mapping at PTE level is determined from a standard kernel
- * text symbol. But pfns for higher page table levels are derived by
- * masking lower bits of this real pfn. These derived pfns might not
- * exist on the platform but that does not really matter as pfn_pxx()
- * helpers will still create appropriate entries for the test. This
- * helps avoid large memory block allocations to be used for mapping
- * at higher page table levels in some of the tests.
- */
- phys = __pa_symbol(&start_kernel);
- args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK);
- args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK);
- args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK);
- args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK);
- args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK);
- WARN_ON(!pfn_valid(args->fixed_pte_pfn));
+ init_fixed_pfns(args);
/*
* Allocate (huge) pages because some of the tests need to access
diff --git a/mm/fadvise.c b/mm/fadvise.c
index bf04fec87f35..fb7c5f43fd2a 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
__filemap_fdatawrite_range(mapping, offset, endbyte,
diff --git a/mm/hmm.c b/mm/hmm.c
index 601a99ce3c84..6a151c09de5e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
+ int ret;
+
spin_unlock(ptl);
- return hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_unlock_read(vma);
+ /*
+ * Avoid deadlock: drop the vma lock before calling
+ * hmm_vma_fault(), which will itself potentially take and
+ * drop the vma lock. This is also correct from a
+ * protection point of view, because there is no further
+ * use here of either pte or ptl after dropping the vma
+ * lock.
+ */
+ ret = hmm_vma_fault(addr, end, required_fault, walk);
+ hugetlb_vma_lock_read(vma);
+ return ret;
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index abe6cfd92ffa..7e68a36b4f7d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1603,7 +1603,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pmd_t orig_pmd;
- struct page *page;
+ struct folio *folio;
struct mm_struct *mm = tlb->mm;
bool ret = false;
@@ -1623,15 +1623,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto out;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
/*
- * If other processes are mapping this page, we couldn't discard
- * the page unless they all do MADV_FREE so let's skip the page.
+ * If other processes are mapping this folio, we couldn't discard
+ * the folio unless they all do MADV_FREE so let's skip the folio.
*/
- if (total_mapcount(page) != 1)
+ if (folio_mapcount(folio) != 1)
goto out;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
/*
@@ -1639,17 +1639,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* will deactivate only them.
*/
if (next - addr != HPAGE_PMD_SIZE) {
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
goto out_unlocked;
}
- if (PageDirty(page))
- ClearPageDirty(page);
- unlock_page(page);
+ if (folio_test_dirty(folio))
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
@@ -1660,7 +1660,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- mark_page_lazyfree(page);
+ folio_mark_lazyfree(folio);
ret = true;
out:
spin_unlock(ptl);
@@ -1920,17 +1920,15 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
entry = pmd_modify(oldpmd, newprot);
- if (uffd_wp) {
- entry = pmd_wrprotect(entry);
+ if (uffd_wp)
entry = pmd_mkuffd_wp(entry);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
/*
* Leave the write bit to be handled by PF interrupt
* handler, then things like COW could be properly
* handled.
*/
entry = pmd_clear_uffd_wp(entry);
- }
/* See change_pte_range(). */
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
@@ -2837,6 +2835,9 @@ void deferred_split_huge_page(struct page *page)
if (PageSwapCache(page))
return;
+ if (!list_empty(page_deferred_list(page)))
+ return;
+
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
@@ -2934,6 +2935,7 @@ static void split_huge_pages_all(void)
{
struct zone *zone;
struct page *page;
+ struct folio *folio;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
@@ -2946,24 +2948,32 @@ static void split_huge_pages_all(void)
int nr_pages;
page = pfn_to_online_page(pfn);
- if (!page || !get_page_unless_zero(page))
+ if (!page || PageTail(page))
+ continue;
+ folio = page_folio(page);
+ if (!folio_try_get(folio))
continue;
- if (zone != page_zone(page))
+ if (unlikely(page_folio(page) != folio))
goto next;
- if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
+ if (zone != folio_zone(folio))
+ goto next;
+
+ if (!folio_test_large(folio)
+ || folio_test_hugetlb(folio)
+ || !folio_test_lru(folio))
goto next;
total++;
- lock_page(page);
- nr_pages = thp_nr_pages(page);
- if (!split_huge_page(page))
+ folio_lock(folio);
+ nr_pages = folio_nr_pages(folio);
+ if (!split_folio(folio))
split++;
pfn += nr_pages - 1;
- unlock_page(page);
+ folio_unlock(folio);
next:
- put_page(page);
+ folio_put(folio);
cond_resched();
}
}
@@ -3275,7 +3285,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
- pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ pmde = pmd_mkuffd_wp(pmde);
if (!is_migration_entry_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bdbfeb6fb393..6fe65f14d33b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -260,12 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
/*
* hugetlb vma_lock helper routines
*/
-static bool __vma_shareable_lock(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
- vma->vm_private_data;
-}
-
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
if (__vma_shareable_lock(vma)) {
@@ -1492,7 +1486,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1956,7 +1950,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
__folio_clear_reserved(folio);
__folio_set_head(folio);
/* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_compound_order(folio, order);
+ folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -2020,7 +2014,7 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
return false;
}
@@ -4981,7 +4975,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
/*
* For shared mappings the vma lock must be held before
- * calling huge_pte_offset in the src vma. Otherwise, the
+ * calling hugetlb_walk() in the src vma. Otherwise, the
* returned ptep could go away if part of a shared pmd and
* another thread calls huge_pmd_unshare.
*/
@@ -4991,7 +4985,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
+ src_pte = hugetlb_walk(src_vma, addr, sz);
if (!src_pte) {
addr |= last_addr_mask;
continue;
@@ -5198,7 +5192,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
- src_pte = huge_pte_offset(mm, old_addr, sz);
+ src_pte = hugetlb_walk(vma, old_addr, sz);
if (!src_pte) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
@@ -5261,7 +5255,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -5574,7 +5568,7 @@ retry_avoidcopy:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -5612,7 +5606,7 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -5919,7 +5913,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* if populated.
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
- new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
+ new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
@@ -5994,22 +5988,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6024,10 +6002,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
@@ -6056,8 +6030,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release the hugetlb fault lock now, but retain
+ * the vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
@@ -6402,10 +6391,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
-retry:
- pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
if (!pte)
- return NULL;
+ goto out_unlock;
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
@@ -6425,19 +6414,11 @@ retry:
page = NULL;
goto out;
}
- } else {
- if (is_hugetlb_entry_migration(entry)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(pte, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
}
out:
spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
return page;
}
@@ -6468,6 +6449,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -6475,8 +6457,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6492,6 +6474,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -6513,6 +6496,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
@@ -6575,6 +6560,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
@@ -6604,6 +6590,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
@@ -6615,6 +6602,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
i += refs;
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -6627,7 +6615,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6636,7 +6624,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0, psize = huge_page_size(h);
+ long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
@@ -6661,7 +6649,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, psize);
+ ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
if (!uffd_wp) {
address |= last_addr_mask;
@@ -6672,8 +6660,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* pre-allocations to install pte markers.
*/
ptep = huge_pte_alloc(mm, vma, address, psize);
- if (!ptep)
+ if (!ptep) {
+ pages = -ENOMEM;
break;
+ }
}
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6728,7 +6718,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (uffd_wp)
- pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ pte = huge_pte_mkuffd_wp(pte);
else if (uffd_wp_resolve)
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
@@ -6763,7 +6753,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
- return pages << h->order;
+ return pages > 0 ? (pages << h->order) : pages;
}
/* Return true if reservation was successful, false otherwise. */
@@ -6772,7 +6762,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg, add = -1;
+ long chg = -1, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7071,8 +7061,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -7384,7 +7374,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
diff --git a/mm/internal.h b/mm/internal.h
index bcf75a8b032d..1d6f4e168510 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -378,6 +378,25 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset);
+/*
+ * This will have no effect, other than possibly generating a warning, if the
+ * caller passes in a non-large folio.
+ */
+static inline void folio_set_order(struct folio *folio, unsigned int order)
+{
+ if (WARN_ON_ONCE(!folio_test_large(folio)))
+ return;
+
+ folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+ /*
+ * When hugetlb dissolves a folio, we need to clear the tail
+ * page, rather than setting nr_pages to 1.
+ */
+ folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 833bf2cfd2a3..6b8e9c848573 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -95,19 +95,24 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
}
#endif /* CONFIG_KASAN_STACK */
-void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
+bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
if (unlikely(PageHighMem(page)))
- return;
+ return false;
+
+ if (!kasan_sample_page_alloc(order))
+ return false;
tag = kasan_random_tag();
kasan_unpoison(set_tag(page_address(page), tag),
PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
+
+ return true;
}
void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
@@ -117,11 +122,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
KASAN_PAGE_FREE, init);
}
-void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
-{
- cache->kasan_info.is_kmalloc = true;
-}
-
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -321,7 +321,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
- if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && !is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, tagged_object, flags);
return tagged_object;
@@ -367,7 +367,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
* Save alloc info (if possible) for kmalloc() allocations.
* This also rewrites the alloc info when called from kasan_krealloc().
*/
- if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+ if (kasan_stack_collection_enabled() && is_kmalloc_cache(cache))
kasan_save_alloc_info(cache, (void *)object, flags);
/* Keep the tag that was set by kasan_slab_alloc(). */
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index b22c4f461cb0..d1bcb0205327 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -59,6 +59,24 @@ EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to enable vmalloc tagging. */
DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+#define PAGE_ALLOC_SAMPLE_DEFAULT 1
+#define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3
+
+/*
+ * Sampling interval of page_alloc allocation (un)poisoning.
+ * Defaults to no sampling.
+ */
+unsigned long kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+
+/*
+ * Minimum order of page_alloc allocations to be affected by sampling.
+ * The default value is chosen to match both
+ * PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER.
+ */
+unsigned int kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+
+DEFINE_PER_CPU(long, kasan_page_alloc_skip);
+
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -122,6 +140,48 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
+/* kasan.page_alloc.sample=<sampling interval> */
+static int __init early_kasan_flag_page_alloc_sample(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtoul(arg, 0, &kasan_page_alloc_sample);
+ if (rv)
+ return rv;
+
+ if (!kasan_page_alloc_sample || kasan_page_alloc_sample > LONG_MAX) {
+ kasan_page_alloc_sample = PAGE_ALLOC_SAMPLE_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample);
+
+/* kasan.page_alloc.sample.order=<minimum page order> */
+static int __init early_kasan_flag_page_alloc_sample_order(char *arg)
+{
+ int rv;
+
+ if (!arg)
+ return -EINVAL;
+
+ rv = kstrtouint(arg, 0, &kasan_page_alloc_sample_order);
+ if (rv)
+ return rv;
+
+ if (kasan_page_alloc_sample_order > INT_MAX) {
+ kasan_page_alloc_sample_order = PAGE_ALLOC_SAMPLE_ORDER_DEFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order);
+
/*
* kasan_init_hw_tags_cpu() is called for each CPU.
* Not marked as __init as a CPU can be hot-plugged after boot.
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ea8cf1310b1e..32413f22aa82 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -42,6 +42,10 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+extern unsigned long kasan_page_alloc_sample;
+extern unsigned int kasan_page_alloc_sample_order;
+DECLARE_PER_CPU(long, kasan_page_alloc_skip);
+
static inline bool kasan_vmalloc_enabled(void)
{
return static_branch_likely(&kasan_flag_vmalloc);
@@ -57,6 +61,24 @@ static inline bool kasan_sync_fault_possible(void)
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ /* Fast-path for when sampling is disabled. */
+ if (kasan_page_alloc_sample == 1)
+ return true;
+
+ if (order < kasan_page_alloc_sample_order)
+ return true;
+
+ if (this_cpu_dec_return(kasan_page_alloc_skip) < 0) {
+ this_cpu_write(kasan_page_alloc_skip,
+ kasan_page_alloc_sample - 1);
+ return true;
+ }
+
+ return false;
+}
+
#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_async_fault_possible(void)
@@ -69,6 +91,11 @@ static inline bool kasan_sync_fault_possible(void)
return true;
}
+static inline bool kasan_sample_page_alloc(unsigned int order)
+{
+ return true;
+}
+
#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_GENERIC
diff --git a/mm/madvise.c b/mm/madvise.c
index b6ea204d4e23..e407d335e614 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -345,8 +345,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
struct vm_area_struct *vma = walk->vma;
pte_t *orig_pte, *pte, ptent;
spinlock_t *ptl;
- struct page *page = NULL;
- LIST_HEAD(page_list);
+ struct folio *folio = NULL;
+ LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
if (fatal_signal_pending(current))
@@ -375,26 +375,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- page = pmd_page(orig_pmd);
+ folio = pfn_folio(pmd_pfn(orig_pmd));
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /* Do not interfere with other mappings of this folio */
+ if (folio_mapcount(folio) != 1)
goto huge_unlock;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- get_page(page);
+ folio_get(folio);
spin_unlock(ptl);
- lock_page(page);
- err = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
if (!err)
- goto regular_page;
+ goto regular_folio;
return 0;
}
@@ -406,25 +406,25 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (!folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
return 0;
}
-regular_page:
+regular_folio:
if (pmd_trans_unstable(pmd))
return 0;
#endif
@@ -441,33 +441,33 @@ regular_page:
if (!pte_present(ptent))
continue;
- page = vm_normal_page(vma, addr, ptent);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, addr, ptent);
+ if (!folio || folio_is_zone_device(folio))
continue;
/*
* Creating a THP page is expensive so split it only if we
* are sure it's worth. Split it if we are only owner.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
break;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
break;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
break;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
@@ -475,16 +475,16 @@ regular_page:
}
/*
- * Do not interfere with other mappings of this page and
- * non-LRU page.
+ * Do not interfere with other mappings of this folio and
+ * non-LRU folio.
*/
- if (!PageLRU(page) || page_mapcount(page) != 1)
+ if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
continue;
- if (pageout_anon_only_filter && !PageAnon(page))
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
if (pte_young(ptent)) {
ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -495,28 +495,28 @@ regular_page:
}
/*
- * We are deactivating a page for accelerating reclaiming.
- * VM couldn't reclaim the page unless we clear PG_young.
+ * We are deactivating a folio for accelerating reclaiming.
+ * VM couldn't reclaim the folio unless we clear PG_young.
* As a side effect, it makes confuse idle-page tracking
* because they will miss recent referenced history.
*/
- ClearPageReferenced(page);
- test_and_clear_page_young(page);
+ folio_clear_referenced(folio);
+ folio_test_clear_young(folio);
if (pageout) {
- if (!isolate_lru_page(page)) {
- if (PageUnevictable(page))
- putback_lru_page(page);
+ if (!folio_isolate_lru(folio)) {
+ if (folio_test_unevictable(folio))
+ folio_putback_lru(folio);
else
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &folio_list);
}
} else
- deactivate_page(page);
+ folio_deactivate(folio);
}
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_pte, ptl);
if (pageout)
- reclaim_pages(&page_list);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -728,7 +728,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(&folio->page);
+ folio_mark_lazyfree(folio);
}
out:
if (nr_swap) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 73afff8062f9..faeea84964aa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -477,6 +477,16 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
@@ -2939,13 +2949,13 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
}
/*
- * page_memcg_check() is used here, because in theory we can encounter
+ * folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
* slab->memcg_data has not been freed yet
- * page_memcg_check(page) will guarantee that a proper memory
+ * folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
- return page_memcg_check(folio_page(folio, 0));
+ return folio_memcg_check(folio);
}
/*
@@ -3526,6 +3536,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;
@@ -3914,6 +3927,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
+
if (val & ~MOVE_MASK)
return -EINVAL;
@@ -5382,6 +5399,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
offline_kmem:
memcg_offline_kmem(memcg);
@@ -5413,6 +5431,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);
drain_all_stock(memcg);
@@ -5424,6 +5443,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -5687,7 +5707,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must make sure the page is not on LRU (isolate_page() is useful.)
+ * The page must be locked and not on the LRU.
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
@@ -5704,20 +5724,13 @@ static int mem_cgroup_move_account(struct page *page,
int nid, ret;
VM_BUG_ON(from == to);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON(compound && !folio_test_large(folio));
- /*
- * Prevent mem_cgroup_migrate() from looking at
- * page's memory cgroup of its source page while we change it.
- */
- ret = -EBUSY;
- if (!folio_trylock(folio))
- goto out;
-
ret = -EINVAL;
if (folio_memcg(folio) != from)
- goto out_unlock;
+ goto out;
pgdat = folio_pgdat(folio);
from_vec = mem_cgroup_lruvec(from, pgdat);
@@ -5804,8 +5817,6 @@ static int mem_cgroup_move_account(struct page *page,
mem_cgroup_charge_statistics(from, -nr_pages);
memcg_check_events(from, nid);
local_irq_enable();
-out_unlock:
- folio_unlock(folio);
out:
return ret;
}
@@ -5854,6 +5865,29 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
+ if (target && page) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ret;
+ }
+ /*
+ * page_mapped() must be stable during the move. This
+ * pte is locked, so if it's present, the page cannot
+ * become unmapped. If it isn't, we have only partial
+ * control over the mapped state: the page lock will
+ * prevent new faults against pagecache and swapcache,
+ * so an unmapped page cannot become mapped. However,
+ * if the page is already mapped elsewhere, it can
+ * unmap, and there is nothing we can do about it.
+ * Alas, skip moving the page in this case.
+ */
+ if (!pte_present(ptent) && page_mapped(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+ }
+
if (!page && !ent.val)
return ret;
if (page) {
@@ -5870,8 +5904,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (target)
target->page = page;
}
- if (!ret || !target)
+ if (!ret || !target) {
+ if (target)
+ unlock_page(page);
put_page(page);
+ }
}
/*
* There is a swap entry and a page doesn't exist or isn't charged.
@@ -5911,6 +5948,10 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ return MC_TARGET_NONE;
+ }
target->page = page;
}
}
@@ -6149,6 +6190,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
putback_lru_page(page);
}
+ unlock_page(page);
put_page(page);
} else if (target_type == MC_TARGET_DEVICE) {
page = target.page;
@@ -6157,6 +6199,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
}
+ unlock_page(page);
put_page(page);
}
spin_unlock(ptl);
@@ -6199,7 +6242,8 @@ retry:
}
if (!device)
putback_lru_page(page);
-put: /* get_mctgt_type() gets the page */
+put: /* get_mctgt_type() gets & locks the page */
+ unlock_page(page);
put_page(page);
break;
case MC_TARGET_SWAP:
diff --git a/mm/memfd.c b/mm/memfd.c
index 08f5f8304746..a0a7a37e8177 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -18,6 +18,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
+#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
/*
@@ -147,6 +148,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
@@ -175,6 +177,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
+ * SEAL_EXEC: Prevent modification of the exec bits in the file mode
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
@@ -219,6 +222,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
}
+ /*
+ * SEAL_EXEC implys SEAL_WRITE, making W^X from the start.
+ */
+ if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
+ seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
+
*file_seals |= seals;
error = 0;
@@ -261,12 +270,13 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
+ char comm[TASK_COMM_LEN];
unsigned int *file_seals;
struct file *file;
int fd, error;
@@ -283,6 +293,40 @@ SYSCALL_DEFINE2(memfd_create,
return -EINVAL;
}
+ /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
+ return -EINVAL;
+
+ if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+#ifdef CONFIG_SYSCTL
+ int sysctl = MEMFD_NOEXEC_SCOPE_EXEC;
+ struct pid_namespace *ns;
+
+ ns = task_active_pid_ns(current);
+ if (ns)
+ sysctl = ns->memfd_noexec_scope;
+
+ switch (sysctl) {
+ case MEMFD_NOEXEC_SCOPE_EXEC:
+ flags |= MFD_EXEC;
+ break;
+ case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL:
+ flags |= MFD_NOEXEC_SEAL;
+ break;
+ default:
+ pr_warn_once(
+ "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ return -EINVAL;
+ }
+#else
+ flags |= MFD_EXEC;
+#endif
+ pr_warn_once(
+ "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n",
+ task_pid_nr(current), get_task_comm(comm, current));
+ }
+
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
if (len <= 0)
@@ -326,7 +370,15 @@ SYSCALL_DEFINE2(memfd_create,
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING) {
+ if (flags & MFD_NOEXEC_SEAL) {
+ struct inode *inode = file_inode(file);
+
+ inode->i_mode &= ~0111;
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ } else if (flags & MFD_ALLOW_SEALING) {
+ /* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
*file_seals &= ~F_SEAL_SEAL;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..6bf07345ea2c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -24,7 +24,7 @@
* - You have a test that can be added to mce-test
* https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
* - The case actually shows up as a frequent (top 10) page state in
- * tools/vm/page-types when running a real workload.
+ * tools/mm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
diff --git a/mm/memory.c b/mm/memory.c
index 3e836fecd035..90f8f72777c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -625,6 +625,16 @@ out:
return pfn_to_page(pfn);
}
+struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
+{
+ struct page *page = vm_normal_page(vma, addr, pte);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
@@ -878,7 +888,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, *src_pte))
/* Uffd-wp needs to be delivered to dest pte as well */
- pte = pte_wrprotect(pte_mkuffd_wp(pte));
+ pte = pte_mkuffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -1392,8 +1402,7 @@ again:
force_flush = 1;
}
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
@@ -1684,36 +1693,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
}
/**
- * zap_page_range - remove user pages in a given range
- * @vma: vm_area_struct holding the applicable pages
- * @start: starting address of pages to zap
- * @size: number of bytes to zap
- *
- * Caller must protect the VMA list
- */
-void zap_page_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long size)
-{
- struct maple_tree *mt = &vma->vm_mm->mm_mt;
- unsigned long end = start + size;
- struct mmu_notifier_range range;
- struct mmu_gather tlb;
- MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
-
- lru_add_drain();
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- start, start + size);
- tlb_gather_mmu(&tlb, vma->vm_mm);
- update_hiwater_rss(vma->vm_mm);
- mmu_notifier_invalidate_range_start(&range);
- do {
- unmap_single_vma(&tlb, vma, start, range.end, NULL);
- } while ((vma = mas_find(&mas, end - 1)) != NULL);
- mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-}
-
-/**
* zap_page_range_single - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
@@ -3950,10 +3929,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(vmf->orig_pte)) {
+ if (pte_swp_uffd_wp(vmf->orig_pte))
pte = pte_mkuffd_wp(pte);
- pte = pte_wrprotect(pte);
- }
vmf->orig_pte = pte;
/* ksm created a completely new copy */
@@ -4296,7 +4273,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (unlikely(uffd_wp))
- entry = pte_mkuffd_wp(pte_wrprotect(entry));
+ entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
@@ -5137,8 +5114,8 @@ static inline void mm_account_fault(struct pt_regs *regs,
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
- /* the LRU algorithm doesn't apply to sequential or random reads */
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
}
static void lru_gen_exit_fault(void)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f940395667c8..72142fbe7652 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -632,13 +632,12 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
struct mmu_gather tlb;
- int nr_updated;
+ long nr_updated;
tlb_gather_mmu(&tlb, vma->vm_mm);
- nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
- MM_CP_PROT_NUMA);
- if (nr_updated)
+ nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
+ if (nr_updated > 0)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
tlb_finish_mmu(&tlb);
@@ -1490,7 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct mempolicy *new;
+ struct mempolicy *new, *old;
unsigned long vmstart;
unsigned long vmend;
unsigned long end;
@@ -1522,31 +1521,27 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
return 0;
mmap_write_lock(mm);
for_each_vma_range(vmi, vma, end) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
- new = mpol_dup(vma_policy(vma));
- if (IS_ERR(new)) {
- err = PTR_ERR(new);
- break;
- }
- /*
- * Only update home node if there is an existing vma policy
- */
- if (!new)
- continue;
-
/*
* If any vma in the range got policy other than MPOL_BIND
* or MPOL_PREFERRED_MANY we return error. We don't reset
* the home node for vmas we already updated before.
*/
- if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
- mpol_put(new);
+ old = vma_policy(vma);
+ if (!old)
+ continue;
+ if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
}
+ new = mpol_dup(old);
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
new->home_node = home_node;
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
err = mbind_range(mm, vmstart, vmend, new);
mpol_put(new);
if (err)
diff --git a/mm/migrate.c b/mm/migrate.c
index a4d3fc65085f..98de7ce2b576 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -329,24 +329,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ hugetlb_vma_assert_locked(vma);
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 61cf60015a8b..92fc6f3fa512 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -80,13 +80,13 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
return pte_dirty(pte);
}
-static unsigned long change_pte_range(struct mmu_gather *tlb,
+static long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
- unsigned long pages = 0;
+ long pages = 0;
int target_node = NUMA_NO_NODE;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
@@ -177,12 +177,10 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
oldpte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_modify(oldpte, newprot);
- if (uffd_wp) {
- ptent = pte_wrprotect(ptent);
+ if (uffd_wp)
ptent = pte_mkuffd_wp(ptent);
- } else if (uffd_wp_resolve) {
+ else if (uffd_wp_resolve)
ptent = pte_clear_uffd_wp(ptent);
- }
/*
* In some writable, shared mappings, we might want
@@ -332,36 +330,42 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
/*
* If wr-protecting the range for file-backed, populate pgtable for the case
* when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed it means no memory, we don't have a better option but stop.
+ * failed we treat it the same way as pgtable allocation failures during
+ * page faults by kicking OOM and returning error.
*/
#define change_pmd_prepare(vma, pmd, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
- if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \
- break; \
+ if (pte_alloc(vma->vm_mm, pmd)) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
+
/*
* This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
* have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
* while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
*/
#define change_prepare(vma, high, low, addr, cp_flags) \
- do { \
+ ({ \
+ long err = 0; \
if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
- if (WARN_ON_ONCE(p == NULL)) \
- break; \
+ if (p == NULL) \
+ err = -ENOMEM; \
} \
- } while (0)
+ err; \
+ })
-static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
+static inline long change_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0;
unsigned long nr_huge_updates = 0;
struct mmu_notifier_range range;
@@ -369,11 +373,15 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
+ long ret;
next = pmd_addr_end(addr, end);
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
@@ -403,7 +411,11 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
* cleared; make sure pmd populated if
* necessary, then fall-through to pte level.
*/
- change_pmd_prepare(vma, pmd, cp_flags);
+ ret = change_pmd_prepare(vma, pmd, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
} else {
/*
* change_huge_pmd() does not defer TLB flushes,
@@ -424,9 +436,8 @@ static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(tlb, vma, pmd, addr, next,
- newprot, cp_flags);
- pages += this_pages;
+ pages += change_pte_range(tlb, vma, pmd, addr, next,
+ newprot, cp_flags);
next:
cond_resched();
} while (pmd++, addr = next, addr != end);
@@ -439,18 +450,20 @@ next:
return pages;
}
-static inline unsigned long change_pud_range(struct mmu_gather *tlb,
+static inline long change_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- change_prepare(vma, pud, pmd, addr, cp_flags);
+ ret = change_prepare(vma, pud, pmd, addr, cp_flags);
+ if (ret)
+ return ret;
if (pud_none_or_clear_bad(pud))
continue;
pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
@@ -460,18 +473,20 @@ static inline unsigned long change_pud_range(struct mmu_gather *tlb,
return pages;
}
-static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
+static inline long change_p4d_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- change_prepare(vma, p4d, pud, addr, cp_flags);
+ ret = change_prepare(vma, p4d, pud, addr, cp_flags);
+ if (ret)
+ return ret;
if (p4d_none_or_clear_bad(p4d))
continue;
pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
@@ -481,21 +496,25 @@ static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
return pages;
}
-static unsigned long change_protection_range(struct mmu_gather *tlb,
+static long change_protection_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
- unsigned long pages = 0;
+ long pages = 0, ret;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
tlb_start_vma(tlb, vma);
do {
next = pgd_addr_end(addr, end);
- change_prepare(vma, pgd, p4d, addr, cp_flags);
+ ret = change_prepare(vma, pgd, p4d, addr, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
if (pgd_none_or_clear_bad(pgd))
continue;
pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
@@ -507,15 +526,27 @@ static unsigned long change_protection_range(struct mmu_gather *tlb,
return pages;
}
-unsigned long change_protection(struct mmu_gather *tlb,
+long change_protection(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+ unsigned long end, unsigned long cp_flags)
{
- unsigned long pages;
+ pgprot_t newprot = vma->vm_page_prot;
+ long pages;
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Ordinary protection updates (mprotect, uffd-wp, softdirty tracking)
+ * are expected to reflect their requirements via VMA flags such that
+ * vma_set_page_prot() will adjust vma->vm_page_prot accordingly.
+ */
+ if (cp_flags & MM_CP_PROT_NUMA)
+ newprot = PAGE_NONE;
+#else
+ WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
+#endif
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot,
cp_flags);
@@ -644,7 +675,7 @@ success:
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
- change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags);
+ change_protection(tlb, vma, start, end, mm_cp_flags);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
diff --git a/mm/nommu.c b/mm/nommu.c
index 5b83938ecb67..df1711acdf5b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -892,29 +892,36 @@ static unsigned long determine_vm_flags(struct file *file,
unsigned long vm_flags;
vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
- /* vm_flags |= mm->def_flags; */
- if (!(capabilities & NOMMU_MAP_DIRECT)) {
- /* attempt to share read-only copies of mapped file chunks */
+ if (!file) {
+ /*
+ * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because
+ * there is no fork().
+ */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
- if (file && !(prot & PROT_WRITE))
- vm_flags |= VM_MAYSHARE;
+ } else if (flags & MAP_PRIVATE) {
+ /* MAP_PRIVATE file mapping */
+ if (capabilities & NOMMU_MAP_DIRECT)
+ vm_flags |= (capabilities & NOMMU_VMFLAGS);
+ else
+ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+ if (!(prot & PROT_WRITE) && !current->ptrace)
+ /*
+ * R/O private file mapping which cannot be used to
+ * modify memory, especially also not via active ptrace
+ * (e.g., set breakpoints) or later by upgrading
+ * permissions (no mprotect()). We can try overlaying
+ * the file mapping, which will work e.g., on chardevs,
+ * ramfs/tmpfs/shmfs and romfs/cramf.
+ */
+ vm_flags |= VM_MAYOVERLAY;
} else {
- /* overlay a shareable mapping on the backing device or inode
- * if possible - used for chardevs, ramfs/tmpfs/shmfs and
- * romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
- if (flags & MAP_SHARED)
- vm_flags |= VM_SHARED;
+ /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */
+ vm_flags |= VM_SHARED | VM_MAYSHARE |
+ (capabilities & NOMMU_VMFLAGS);
}
- /* refuse to let anyone share private mappings with this process if
- * it's being traced - otherwise breakpoints set in it may interfere
- * with another untraced process
- */
- if ((flags & MAP_PRIVATE) && current->ptrace)
- vm_flags &= ~VM_MAYSHARE;
-
return vm_flags;
}
@@ -952,15 +959,18 @@ static int do_mmap_private(struct vm_area_struct *vma,
void *base;
int ret, order;
- /* invoke the file's mapping function so that it can keep track of
- * shared mappings on devices or memory
- * - VM_MAYSHARE will be set if it may attempt to share
+ /*
+ * Invoke the file's mapping function so that it can keep track of
+ * shared mappings on devices or memory. VM_MAYOVERLAY will be set if
+ * it may attempt to share, which will make is_nommu_shared_mapping()
+ * happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
+ /* shouldn't return success if we're not sharing */
+ if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
+ ret = -ENOSYS;
if (ret == 0) {
- /* shouldn't return success if we're not sharing */
- BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
}
@@ -1106,7 +1116,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
- if (vm_flags & VM_MAYSHARE) {
+ if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
@@ -1116,7 +1126,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
- if (!(pregion->vm_flags & VM_MAYSHARE))
+ if (!is_nommu_shared_mapping(pregion->vm_flags))
continue;
/* search for overlapping mappings on the same file */
@@ -1600,7 +1610,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
- if (vma->vm_flags & VM_MAYSHARE)
+ if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ad608ef2a243..e91f94b3438b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2526,12 +2526,8 @@ continue_unlock:
}
EXPORT_SYMBOL(write_cache_pages);
-/*
- * Function used by generic_writepages to call the real writepage
- * function and set the mapping flags on error
- */
-static int __writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static int writepage_cb(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct address_space *mapping = data;
int ret = mapping->a_ops->writepage(page, wbc);
@@ -2539,34 +2535,6 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
return ret;
}
-/**
- * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- *
- * Return: %0 on success, negative error code otherwise
- */
-int generic_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct blk_plug plug;
- int ret;
-
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __writepage, mapping);
- blk_finish_plug(&plug);
- return ret;
-}
-
-EXPORT_SYMBOL(generic_writepages);
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2577,11 +2545,20 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages)
+ if (mapping->a_ops->writepages) {
ret = mapping->a_ops->writepages(mapping, wbc);
- else
- ret = generic_writepages(mapping, wbc);
- if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
+ } else if (mapping->a_ops->writepage) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, writepage_cb,
+ mapping);
+ blk_finish_plug(&plug);
+ } else {
+ /* deal with chardevs and other special files */
+ ret = 0;
+ }
+ if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
/*
@@ -2713,7 +2690,7 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
*
* The caller must hold lock_page_memcg(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
- * means (eg zap_page_range() has it mapped and is holding the page table
+ * means (eg zap_vma_pages() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
* cannot prove is always protected against truncate.
*/
@@ -2846,11 +2823,11 @@ bool folio_mark_dirty(struct folio *folio)
if (likely(mapping)) {
/*
- * readahead/lru_deactivate_page could remain
+ * readahead/folio_deactivate could remain
* PG_readahead/PG_reclaim due to race with folio_end_writeback
* About readahead, if the folio is written, the flags would be
* reset. So no problem.
- * About lru_deactivate_page, if the folio is redirtied,
+ * About folio_deactivate, if the folio is redirtied,
* the flag will be reset. So no problem. but if the
* folio is used by readahead it will confuse readahead
* and make it restart the size rampup process. But it's
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0745aedebb37..5514d84cc712 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -443,15 +443,15 @@ static inline bool deferred_pages_enabled(void)
return static_branch_unlikely(&deferred_pages);
}
-/* Returns true if the struct page for the pfn is uninitialised */
-static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+/* Returns true if the struct page for the pfn is initialised */
+static inline bool __meminit early_page_initialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
+ return false;
- return false;
+ return true;
}
/*
@@ -498,9 +498,9 @@ static inline bool deferred_pages_enabled(void)
return false;
}
-static inline bool early_page_uninitialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn)
{
- return false;
+ return true;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
@@ -1356,6 +1356,8 @@ out:
* see the comment next to it.
* 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
* see the comment next to it.
+ * 4. The allocation is excluded from being checked due to sampling,
+ * see the call to kasan_unpoison_pages.
*
* Poisoning pages during deferred memory init will greatly lengthen the
* process and cause problem in large memory systems as the deferred pages
@@ -1641,7 +1643,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
pg_data_t *pgdat;
int nid, zid;
- if (!early_page_uninitialised(pfn))
+ if (early_page_initialised(pfn))
return;
nid = early_pfn_to_nid(pfn);
@@ -1804,7 +1806,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (early_page_uninitialised(pfn))
+ if (!early_page_initialised(pfn))
return;
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
@@ -2468,7 +2470,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
{
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
- bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ bool reset_tags = !zero_tags;
int i;
set_page_private(page, 0);
@@ -2491,30 +2494,42 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
*/
/*
- * If memory tags should be zeroed (which happens only when memory
- * should be initialized as well).
+ * If memory tags should be zeroed
+ * (which happens only when memory should be initialized as well).
*/
- if (init_tags) {
+ if (zero_tags) {
/* Initialize both memory and tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
- /* Note that memory is already initialized by the loop above. */
+ /* Take note that memory was initialized by the loop above. */
init = false;
}
if (!should_skip_kasan_unpoison(gfp_flags)) {
- /* Unpoison shadow memory or set memory tags. */
- kasan_unpoison_pages(page, order, init);
-
- /* Note that memory is already initialized by KASAN. */
- if (kasan_has_integrated_init())
- init = false;
- } else {
- /* Ensure page_address() dereferencing does not fault. */
+ /* Try unpoisoning (or setting tags) and initializing memory. */
+ if (kasan_unpoison_pages(page, order, init)) {
+ /* Take note that memory was initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
+ /* Take note that memory tags were set by KASAN. */
+ reset_tags = false;
+ } else {
+ /*
+ * KASAN decided to exclude this allocation from being
+ * poisoned due to sampling. Skip poisoning as well.
+ */
+ SetPageSkipKASanPoison(page);
+ }
+ }
+ /*
+ * If memory tags have not been set, reset the page tags to ensure
+ * page_address() dereferencing does not fault.
+ */
+ if (reset_tags) {
for (i = 0; i != 1 << order; ++i)
page_kasan_tag_reset(page + i);
}
- /* If memory is still not initialized, do it now. */
+ /* If memory is still not initialized, initialize it now. */
if (init)
kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
@@ -7926,6 +7941,7 @@ static void __init free_area_init_node(int nid)
pgdat_set_deferred_range(pgdat);
free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}
static void __init free_area_init_memoryless_node(int nid)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index bc08332a609c..41ea77f22011 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -31,19 +31,22 @@
*
* This function tries to get a user memory page by pfn as described above.
*/
-static struct page *page_idle_get_page(unsigned long pfn)
+static struct folio *page_idle_get_folio(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
- if (!page || !PageLRU(page) ||
- !get_page_unless_zero(page))
+ if (!page || PageTail(page))
return NULL;
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
+ folio = page_folio(page);
+ if (!folio_test_lru(folio) || !folio_try_get(folio))
+ return NULL;
+ if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) {
+ folio_put(folio);
+ folio = NULL;
}
- return page;
+ return folio;
}
static bool page_idle_clear_pte_refs_one(struct folio *folio,
@@ -83,10 +86,8 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
return true;
}
-static void page_idle_clear_pte_refs(struct page *page)
+static void page_idle_clear_pte_refs(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
/*
* Since rwc.try_lock is unused, rwc is effectively immutable, so we
* can make it static to save some cycles and stack.
@@ -115,7 +116,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
u64 *out = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -134,19 +135,19 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
bit = pfn % BITMAP_CHUNK_BITS;
if (!bit)
*out = 0ULL;
- page = page_idle_get_page(pfn);
- if (page) {
- if (page_is_idle(page)) {
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ if (folio_test_idle(folio)) {
/*
* The page might have been referenced via a
* pte, in which case it is not idle. Clear
* refs and recheck.
*/
- page_idle_clear_pte_refs(page);
- if (page_is_idle(page))
+ page_idle_clear_pte_refs(folio);
+ if (folio_test_idle(folio))
*out |= 1ULL << bit;
}
- put_page(page);
+ folio_put(folio);
}
if (bit == BITMAP_CHUNK_BITS - 1)
out++;
@@ -160,7 +161,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
loff_t pos, size_t count)
{
const u64 *in = (u64 *)buf;
- struct page *page;
+ struct folio *folio;
unsigned long pfn, end_pfn;
int bit;
@@ -178,11 +179,11 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
if ((*in >> bit) & 1) {
- page = page_idle_get_page(pfn);
- if (page) {
- page_idle_clear_pte_refs(page);
- set_page_idle(page);
- put_page(page);
+ folio = page_idle_get_folio(pfn);
+ if (folio) {
+ page_idle_clear_pte_refs(folio);
+ folio_set_idle(folio);
+ folio_put(folio);
}
}
if (bit == BITMAP_CHUNK_BITS - 1)
diff --git a/mm/page_io.c b/mm/page_io.c
index 3a5f921b932e..905d9fcc0c96 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,7 +18,6 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
-#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
#include <linux/blkdev.h>
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 79a8554f024c..c65813a9dc78 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -356,7 +356,8 @@ int page_reporting_register(struct page_reporting_dev_info *prdev)
mutex_lock(&page_reporting_mutex);
/* nothing to do if already in use */
- if (rcu_access_pointer(pr_dev_info)) {
+ if (rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
err = -EBUSY;
goto err_out;
}
@@ -401,7 +402,8 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
mutex_lock(&page_reporting_mutex);
- if (rcu_access_pointer(pr_dev_info) == prdev) {
+ if (prdev == rcu_dereference_protected(pr_dev_info,
+ lockdep_is_held(&page_reporting_mutex))) {
/* Disable page reporting notification */
RCU_INIT_POINTER(pr_dev_info, NULL);
synchronize_rcu();
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 93e13fc17d3c..4e448cfbc6ef 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -168,9 +168,12 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
/* The only possible mapping was handled on last iteration */
if (pvmw->pte)
return not_found(pvmw);
-
- /* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
+ /*
+ * All callers that get here will already hold the
+ * i_mmap_rwsem. Therefore, no additional locks need to be
+ * taken before calling hugetlb_walk().
+ */
+ pvmw->pte = hugetlb_walk(vma, pvmw->address, size);
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7f1c9b274906..cb23f8a15c13 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,18 +302,18 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ hugetlb_vma_lock_read(vma);
do {
next = hugetlb_entry_end(h, addr, end);
- pte = huge_pte_offset(walk->mm, addr & hmask, sz);
-
+ pte = hugetlb_walk(vma, addr & hmask, sz);
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
err = ops->pte_hole(addr, next, -1, walk);
-
if (err)
break;
} while (addr = next, addr != end);
+ hugetlb_vma_unlock_read(vma);
return err;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index b616870a09be..ab74e0547a52 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
lru_gen_look_around(&pvmw);
referenced++;
}
if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the folio has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * the referenced flag or activated the folio.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -875,7 +864,20 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * folio has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the folio in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;
return false;
@@ -906,6 +908,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
.try_lock = true,
+ .invalid_vma = invalid_folio_referenced_vma,
};
*vm_flags = 0;
@@ -921,15 +924,6 @@ int folio_referenced(struct folio *folio, int is_locked,
return 1;
}
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_folio_referenced_vma;
- }
-
rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
@@ -1222,9 +1216,6 @@ void page_add_anon_rmap(struct page *page,
bool compound = flags & RMAP_COMPOUND;
bool first = true;
- if (unlikely(PageKsm(page)))
- lock_page_memcg(page);
-
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
first = atomic_inc_and_test(&page->_mapcount);
@@ -1262,15 +1253,14 @@ void page_add_anon_rmap(struct page *page,
if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
- if (unlikely(PageKsm(page)))
- unlock_page_memcg(page);
-
- /* address might be in next vma when migration races vma_adjust */
- else if (first)
- __page_set_anon_rmap(page, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
- else
- __page_check_anon_rmap(page, vma, address);
+ if (likely(!PageKsm(page))) {
+ /* address might be in next vma when migration races vma_adjust */
+ if (first)
+ __page_set_anon_rmap(page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
+ else
+ __page_check_anon_rmap(page, vma, address);
+ }
mlock_vma_page(page, vma, compound);
}
@@ -1329,7 +1319,6 @@ void page_add_file_rmap(struct page *page,
bool first;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
- lock_page_memcg(page);
/* Is page being mapped by PTE? Is this its first map to be added? */
if (likely(!compound)) {
@@ -1365,7 +1354,6 @@ void page_add_file_rmap(struct page *page,
NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
if (nr)
__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
- unlock_page_memcg(page);
mlock_vma_page(page, vma, compound);
}
@@ -1394,8 +1382,6 @@ void page_remove_rmap(struct page *page,
return;
}
- lock_page_memcg(page);
-
/* Is page being unmapped by PTE? Is this its last map to be removed? */
if (likely(!compound)) {
last = atomic_add_negative(-1, &page->_mapcount);
@@ -1451,8 +1437,6 @@ void page_remove_rmap(struct page *page,
* and remember that it's only reliable while mapped.
*/
- unlock_page_memcg(page);
-
munlock_vma_page(page, vma, compound);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 0005ab2c29af..bc5c156ef470 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1093,6 +1093,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
if (error)
return error;
+ if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
+ if ((inode->i_mode ^ attr->ia_mode) & 0111) {
+ return -EPERM;
+ }
+ }
+
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1733,6 +1739,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
int error;
@@ -1744,6 +1751,14 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (is_swapin_error_entry(swap))
return -EIO;
+ si = get_swap_device(swap);
+ if (!si) {
+ if (!shmem_confirm_swap(mapping, index, swap))
+ return -EEXIST;
+ else
+ return -EINVAL;
+ }
+
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
@@ -1804,6 +1819,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
+ put_swap_device(si);
*foliop = folio;
return 0;
@@ -1817,6 +1833,7 @@ unlock:
folio_unlock(folio);
folio_put(folio);
}
+ put_swap_device(si);
return error;
}
diff --git a/mm/slab.c b/mm/slab.c
index 7a269db050ee..b77be9c6d6b1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1373,7 +1373,7 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
+ if (sk_memalloc_socks() && folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/slab.h b/mm/slab.h
index 7cc432969945..63fb4c00d529 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -323,6 +323,14 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
}
#endif
+static inline bool is_kmalloc_cache(struct kmem_cache *s)
+{
+#ifndef CONFIG_SLOB
+ return (s->flags & SLAB_KMALLOC);
+#else
+ return false;
+#endif
+}
/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1cba98acc486..bf4e777cfe90 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -670,7 +670,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
usersize);
- kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
diff --git a/mm/slub.c b/mm/slub.c
index 13459c69095a..67020074ecb4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1859,7 +1859,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node,
__folio_set_slab(folio);
/* Make the flag visible before any changes to folio->mapping */
smp_wmb();
- if (page_is_pfmemalloc(folio_page(folio, 0)))
+ if (folio_is_pfmemalloc(folio))
slab_set_pfmemalloc(slab);
return slab;
diff --git a/mm/swap.c b/mm/swap.c
index 70e2063ef43a..e54e2a252e27 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -733,17 +733,15 @@ void deactivate_file_folio(struct folio *folio)
}
/*
- * deactivate_page - deactivate a page
- * @page: page to deactivate
+ * folio_deactivate - deactivate a folio
+ * @folio: folio to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * folio_deactivate() moves @folio to the inactive list if @folio was on the
+ * active list and was not unevictable. This is done to accelerate the
+ * reclaim of @folio.
*/
-void deactivate_page(struct page *page)
+void folio_deactivate(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
(folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
@@ -757,16 +755,14 @@ void deactivate_page(struct page *page)
}
/**
- * mark_page_lazyfree - make an anon page lazyfree
- * @page: page to deactivate
+ * folio_mark_lazyfree - make an anon folio lazyfree
+ * @folio: folio to deactivate
*
- * mark_page_lazyfree() moves @page to the inactive file list.
- * This is done to accelerate the reclaim of @page.
+ * folio_mark_lazyfree() moves @folio to the inactive file list.
+ * This is done to accelerate the reclaim of @folio.
*/
-void mark_page_lazyfree(struct page *page)
+void folio_mark_lazyfree(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (folio_test_lru(folio) && folio_test_anon(folio) &&
folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
!folio_test_unevictable(folio)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2927507b43d8..cb9aaa00951d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -321,19 +321,15 @@ static inline bool swap_use_vma_readahead(void)
* unlocked and with its refcount incremented - we rely on the kernel
* lock getting page table operations atomic even if we drop the folio
* lock before returning.
+ *
+ * Caller must lock the swap device or hold a reference to keep it valid.
*/
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
struct folio *folio;
- struct swap_info_struct *si;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
- put_swap_device(si);
-
if (folio) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -693,28 +689,15 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
-static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
- unsigned long faddr,
- unsigned long lpfn,
- unsigned long rpfn,
- unsigned long *start,
- unsigned long *end)
-{
- *start = max3(lpfn, PFN_DOWN(vma->vm_start),
- PFN_DOWN(faddr & PMD_MASK));
- *end = min3(rpfn, PFN_DOWN(vma->vm_end),
- PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
-}
-
static void swap_ra_info(struct vm_fault *vmf,
- struct vma_swap_readahead *ra_info)
+ struct vma_swap_readahead *ra_info)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- unsigned long faddr, pfn, fpfn;
+ unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
- unsigned int max_win, hits, prev_win, win, left;
+ unsigned int max_win, hits, prev_win, win;
#ifndef CONFIG_64BIT
pte_t *tpte;
#endif
@@ -727,8 +710,6 @@ static void swap_ra_info(struct vm_fault *vmf,
}
faddr = vmf->address;
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
@@ -739,22 +720,28 @@ static void swap_ra_info(struct vm_fault *vmf,
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
- if (win == 1) {
- pte_unmap(orig_pte);
+ if (win == 1)
return;
- }
/* Copy the PTEs because the page table may be unmapped */
- if (fpfn == pfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
- else if (pfn == fpfn + 1)
- swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
- &start, &end);
- else {
- left = (win - 1) / 2;
- swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
- &start, &end);
+ orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
+ if (fpfn == pfn + 1) {
+ lpfn = fpfn;
+ rpfn = fpfn + win;
+ } else if (pfn == fpfn + 1) {
+ lpfn = fpfn - win + 1;
+ rpfn = fpfn + 1;
+ } else {
+ unsigned int left = (win - 1) / 2;
+
+ lpfn = fpfn - left;
+ rpfn = fpfn + win - left;
}
+ start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
pte -= ra_info->offset;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4fa440e87cd6..af151679d13a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1836,13 +1836,13 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t *pte;
struct swap_info_struct *si;
int ret = 0;
- volatile unsigned char *swap_map;
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
struct folio *folio;
unsigned long offset;
+ unsigned char swp_count;
if (!is_swap_pte(*pte))
continue;
@@ -1853,7 +1853,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
- swap_map = &si->swap_map[offset];
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
struct page *page;
@@ -1870,8 +1869,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
folio = page_folio(page);
}
if (!folio) {
- if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ swp_count = READ_ONCE(si->swap_map[offset]);
+ if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
goto try_next;
+
return -ENOMEM;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0499907b6f1a..53c3d916ff66 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -74,24 +74,10 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
-
- /*
- * Always mark a PTE as write-protected when needed, regardless of
- * VM_WRITE, which the user might change.
- */
- if (wp_copy) {
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- writable = false;
- }
-
if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
- else
- /*
- * We need this to make sure write bit removed; as mk_pte()
- * could return a pte with write bit set.
- */
- _dst_pte = pte_wrprotect(_dst_pte);
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -724,21 +710,31 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
mmap_changing, 0);
}
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
+ unsigned int mm_cp_flags;
struct mmu_gather tlb;
- pgprot_t newprot;
+ long ret;
if (enable_wp)
- newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ mm_cp_flags = MM_CP_UFFD_WP;
else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
+ mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
+ /*
+ * vma->vm_page_prot already reflects that uffd-wp is enabled for this
+ * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
+ * to be write-protected as default whenever protection changes.
+ * Try upgrading write permissions manually.
+ */
+ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
tlb_gather_mmu(&tlb, dst_mm);
- change_protection(&tlb, dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);
+
+ return ret;
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
@@ -747,7 +743,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
{
struct vm_area_struct *dst_vma;
unsigned long page_mask;
- int err;
+ long err;
/*
* Sanitize the command parameters:
@@ -786,9 +782,12 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;
}
- uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+
+ /* Return 0 on success, <0 on failures */
+ if (err > 0)
+ err = 0;
- err = 0;
out_unlock:
mmap_read_unlock(dst_mm);
return err;
diff --git a/mm/util.c b/mm/util.c
index b56c92fb910f..cec9327b27b4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -120,7 +120,8 @@ EXPORT_SYMBOL(kstrndup);
* @len: memory region length
* @gfp: GFP mask to use
*
- * Return: newly allocated copy of @src or %NULL in case of error
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -134,6 +135,27 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kvmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error,
+ * result may be not physically contiguous. Use kvfree() to free.
+ */
+void *kvmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kvmalloc(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+ return p;
+}
+EXPORT_SYMBOL(kvmemdup);
+
+/**
* kmemdup_nul - Create a NUL-terminated string from unterminated data
* @s: The data to stringify
* @len: The size of the data
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ca71de7c9d77..428e0bee5c9c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1815,9 +1815,9 @@ static void drain_vmap_area_work(struct work_struct *work)
}
/*
- * Free a vmap area, caller ensuring that the area has been unmapped
- * and flush_cache_vunmap had been called for the correct range
- * previously.
+ * Free a vmap area, caller ensuring that the area has been unmapped,
+ * unlinked and flush_cache_vunmap had been called for the correct
+ * range previously.
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
@@ -1825,9 +1825,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
unsigned long va_start = va->va_start;
unsigned long nr_lazy;
- spin_lock(&vmap_area_lock);
- unlink_va(va, &vmap_area_root);
- spin_unlock(&vmap_area_lock);
+ if (WARN_ON_ONCE(!list_empty(&va->list)))
+ return;
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
@@ -1871,6 +1870,19 @@ struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
+static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
+{
+ struct vmap_area *va;
+
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area(addr, &vmap_area_root);
+ if (va)
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ return va;
+}
+
/*** Per cpu kva allocator ***/
/*
@@ -2015,6 +2027,10 @@ static void free_vmap_block(struct vmap_block *vb)
tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
+ spin_lock(&vmap_area_lock);
+ unlink_va(vb->va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
free_vmap_area_noflush(vb->va);
kfree_rcu(vb, rcu_head);
}
@@ -2236,8 +2252,10 @@ void vm_unmap_ram(const void *mem, unsigned int count)
return;
}
- va = find_vmap_area(addr);
- BUG_ON(!va);
+ va = find_unlink_vmap_area(addr);
+ if (WARN_ON_ONCE(!va))
+ return;
+
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
free_unmap_vmap_area(va);
@@ -2591,6 +2609,20 @@ struct vm_struct *find_vm_area(const void *addr)
return va->vm;
}
+static struct vm_struct *__remove_vm_area(struct vmap_area *va)
+{
+ struct vm_struct *vm;
+
+ if (!va || !va->vm)
+ return NULL;
+
+ vm = va->vm;
+ kasan_free_module_shadow(vm);
+ free_unmap_vmap_area(va);
+
+ return vm;
+}
+
/**
* remove_vm_area - find and remove a continuous kernel virtual area
* @addr: base address
@@ -2603,26 +2635,10 @@ struct vm_struct *find_vm_area(const void *addr)
*/
struct vm_struct *remove_vm_area(const void *addr)
{
- struct vmap_area *va;
-
might_sleep();
- spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
- if (va && va->vm) {
- struct vm_struct *vm = va->vm;
-
- va->vm = NULL;
- spin_unlock(&vmap_area_lock);
-
- kasan_free_module_shadow(vm);
- free_unmap_vmap_area(va);
-
- return vm;
- }
-
- spin_unlock(&vmap_area_lock);
- return NULL;
+ return __remove_vm_area(
+ find_unlink_vmap_area((unsigned long) addr));
}
static inline void set_area_direct_map(const struct vm_struct *area,
@@ -2636,16 +2652,17 @@ static inline void set_area_direct_map(const struct vm_struct *area,
set_direct_map(area->pages[i]);
}
-/* Handle removing and resetting vm mappings related to the vm_struct. */
-static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+/* Handle removing and resetting vm mappings related to the VA's vm_struct. */
+static void va_remove_mappings(struct vmap_area *va, int deallocate_pages)
{
+ struct vm_struct *area = va->vm;
unsigned long start = ULONG_MAX, end = 0;
unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
- remove_vm_area(area->addr);
+ __remove_vm_area(va);
/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
if (!flush_reset)
@@ -2690,6 +2707,7 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
+ struct vmap_area *va;
if (!addr)
return;
@@ -2698,19 +2716,20 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = find_vm_area(addr);
- if (unlikely(!area)) {
+ va = find_unlink_vmap_area((unsigned long)addr);
+ if (unlikely(!va)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
+ area = va->vm;
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
- vm_remove_mappings(area, deallocate_pages);
+ va_remove_mappings(va, deallocate_pages);
if (deallocate_pages) {
int i;
@@ -3031,7 +3050,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int ret;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
- gfp_mask |= __GFP_NOWARN;
+
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf3eedf0209c..394ff4962cbc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,8 @@
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/khugepaged.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -135,12 +137,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned int memcgs_need_aging:1;
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;
@@ -449,6 +445,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return sc->target_mem_cgroup;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
@@ -499,6 +500,11 @@ static bool cgroup_reclaim(struct scan_control *sc)
return false;
}
+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
@@ -1920,7 +1926,7 @@ retry:
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principle to deactivate_page()
+ * Similar in principle to folio_deactivate()
* except we already have the folio isolated
* and know it's dirty
*/
@@ -3176,6 +3182,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3201,6 +3210,9 @@ static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ if (!sc->may_swap)
+ return 0;
+
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3215,7 +3227,7 @@ static int get_nr_gens(struct lruvec *lruvec, int type)
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3615,7 +3627,7 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3630,7 +3642,7 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
@@ -3707,7 +3719,7 @@ static int folio_update_gen(struct folio *folio, int gen)
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
int type = folio_is_file_lru(folio);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
@@ -3752,7 +3764,7 @@ static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
walk->batched = 0;
@@ -3785,7 +3797,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal
if (is_vm_hugetlb_page(vma))
return true;
- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
return true;
if (vma == get_gate_vma(vma->vm_mm))
@@ -4230,7 +4245,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
} while (err == -EAGAIN);
}
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
@@ -4238,7 +4253,7 @@ static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
VM_WARN_ON_ONCE(walk);
walk = &pgdat->mm_walk;
- } else if (!pgdat && !walk) {
+ } else if (!walk && force_alloc) {
VM_WARN_ON_ONCE(current_is_kswapd());
walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -4266,7 +4281,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
int zone;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
if (type == LRU_GEN_ANON && !can_swap)
@@ -4274,7 +4289,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
+ struct list_head *head = &lrugen->folios[old_gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4285,7 +4300,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
new_gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
if (!--remaining)
return false;
@@ -4302,7 +4317,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
{
int gen, type, zone;
bool success = false;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4313,7 +4328,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
gen = lru_gen_from_seq(min_seq[type]);
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
goto next;
}
@@ -4323,7 +4338,7 @@ next:
;
}
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_folio */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4345,7 +4360,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
spin_lock_irq(&lruvec->lru_lock);
@@ -4403,7 +4418,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
bool success;
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
@@ -4419,12 +4434,12 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
- walk = set_mm_walk(NULL);
+ walk = set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4447,8 +4462,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4461,97 +4475,52 @@ done:
return true;
}
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
unsigned long total = 0;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ bool can_swap = get_swappiness(lruvec, sc);
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
for (type = !can_swap; type < ANON_AND_FILE; type++) {
unsigned long seq;
for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
gen = lru_gen_from_seq(seq);
for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
}
}
- /* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
- return true;
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
+ /* whether the size is big enough to be helpful */
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
}
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long min_ttl)
{
- bool need_aging;
- unsigned long nr_to_scan;
- int swappiness = get_swappiness(lruvec, sc);
+ int gen;
+ unsigned long birth;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+ /* see the comment on lru_gen_folio */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
- mem_cgroup_calculate_protection(NULL, memcg);
-
- if (mem_cgroup_below_min(NULL, memcg))
+ if (time_is_after_jiffies(birth + min_ttl))
return false;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
-
- if (min_ttl) {
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
- if (time_is_after_jiffies(birth + min_ttl))
- return false;
-
- /* the size is likely too small to be helpful */
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
- return false;
- }
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
- if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+ mem_cgroup_calculate_protection(NULL, memcg);
- return true;
+ return !mem_cgroup_below_min(NULL, memcg);
}
/* to protect the working set of the last N jiffies */
@@ -4560,46 +4529,30 @@ static unsigned long lru_gen_min_ttl __read_mostly;
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
- bool success = false;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
- sc->last_reclaimed = sc->nr_reclaimed;
-
- /*
- * To reduce the chance of going into the aging path, which can be
- * costly, optimistically skip it if the flag below was cleared in the
- * eviction path. This improves the overall performance when multiple
- * memcgs are available.
- */
- if (!sc->memcgs_need_aging) {
- sc->memcgs_need_aging = true;
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
- }
-
- set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- if (age_lruvec(lruvec, sc, min_ttl))
- success = true;
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
- clear_mm_walk();
-
- /* check the order to exclude compaction-induced reclaim */
- if (success || !min_ttl || sc->order)
- return;
-
/*
* The main goal is to OOM kill if every generation from all memcgs is
* younger than min_ttl. However, another possibility is all memcgs are
- * either below min or empty.
+ * either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
struct oom_control oc = {
@@ -4753,7 +4706,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
int tier = lru_tier_from_refs(refs);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
@@ -4778,7 +4731,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4787,7 +4740,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
gen = folio_inc_gen(lruvec, folio, false);
- list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4799,7 +4752,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
if (folio_test_locked(folio) || folio_test_writeback(folio) ||
(type == LRU_GEN_FILE && folio_test_dirty(folio))) {
gen = folio_inc_gen(lruvec, folio, true);
- list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
return true;
}
@@ -4810,12 +4763,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca
{
bool success;
- /* unmapping inhibited */
- if (!sc->may_unmap && folio_mapped(folio))
- return false;
-
/* swapping inhibited */
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ if (!(sc->gfp_mask & __GFP_IO) &&
(folio_test_dirty(folio) ||
(folio_test_anon(folio) && !folio_test_swapcache(folio))))
return false;
@@ -4853,7 +4802,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
VM_WARN_ON_ONCE(!list_empty(list));
@@ -4866,7 +4815,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
- struct list_head *head = &lrugen->lists[gen][type][zone];
+ struct list_head *head = &lrugen->folios[gen][type][zone];
while (!list_empty(head)) {
struct folio *folio = lru_to_folio(head);
@@ -4912,9 +4861,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
__count_vm_events(PGSCAN_ANON + type, isolated);
/*
- * There might not be eligible pages due to reclaim_idx, may_unmap and
- * may_writepage. Check the remaining to prevent livelock if it's not
- * making progress.
+ * There might not be eligible folios due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
*/
return isolated || !remaining ? scanned : 0;
}
@@ -5009,8 +4957,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- bool *need_swapping)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -5099,162 +5046,399 @@ retry:
goto retry;
}
- if (need_swapping && type == LRU_GEN_ANON)
- *need_swapping = true;
-
return scanned;
}
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold folios */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
/*
* For future optimizations:
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, bool *need_aging)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);
- if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
- (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
- !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg))
return 0;
- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!*need_aging)
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;
/* skip the aging path at the default priority */
if (sc->priority == DEF_PRIORITY)
- goto done;
+ return nr_to_scan;
- /* leave the work to lru_gen_age_node() */
- if (current_is_kswapd())
- return 0;
+ /* skip this lruvec as it's low on cold folios */
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+}
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
- return nr_to_scan;
-done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
+{
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!global_reclaim(sc))
+ return -1;
+
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
}
-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
- struct scan_control *sc, bool need_swapping)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- int i;
- DEFINE_MAX_SEQ(lruvec);
+ long nr_to_scan;
+ unsigned long scanned = 0;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
- if (!current_is_kswapd()) {
- /* age each memcg at most once to ensure fairness */
- if (max_seq - seq > 1)
- return true;
+ /* clean file folios are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;
- /* over-swapping can increase allocation latency */
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
- return true;
+ while (true) {
+ int delta;
- /* give this thread a chance to exit and free its memory */
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return true;
- }
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ if (nr_to_scan <= 0)
+ break;
- if (cgroup_reclaim(sc))
- return false;
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
- return false;
+ delta = evict_folios(lruvec, sc, swappiness);
+ if (!delta)
+ break;
- /* keep scanning at low priorities to ensure fairness */
- if (sc->priority > DEF_PRIORITY - 2)
- return false;
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
- /*
- * A minimum amount of work was done under global memory pressure. For
- * kswapd, it may be overshooting. For direct reclaim, the allocation
- * may succeed if all suitable zones are somewhat safe. In either case,
- * it's better to stop now, and restart later if necessary.
- */
- for (i = 0; i <= sc->reclaim_idx; i++) {
- unsigned long wmark;
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
- if (!managed_zone(zone))
+ cond_resched();
+ }
+
+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(NULL, memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(NULL, memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_folio *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
continue;
+ }
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
- return false;
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
}
- sc->nr_reclaimed += MIN_LRU_BATCH;
+ rcu_read_unlock();
- return true;
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
}
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
- bool need_aging = false;
- bool need_swapping = false;
- unsigned long scanned = 0;
- unsigned long reclaimed = sc->nr_reclaimed;
- DEFINE_MAX_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
lru_add_drain();
blk_start_plug(&plug);
- set_mm_walk(lruvec_pgdat(lruvec));
+ set_mm_walk(NULL, sc->proactive);
- while (true) {
- int delta;
- int swappiness;
- unsigned long nr_to_scan;
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
- if (sc->may_swap)
- swappiness = get_swappiness(lruvec, sc);
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- swappiness = 1;
- else
- swappiness = 0;
+ clear_mm_walk();
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
- if (!nr_to_scan)
- goto done;
+ blk_finish_plug(&plug);
+}
- delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
- if (!delta)
- goto done;
+#else /* !CONFIG_MEMCG */
- scanned += delta;
- if (scanned >= nr_to_scan)
- break;
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
- break;
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
- cond_resched();
- }
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ /*
+ * Unmapped clean folios are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat, sc->proactive);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
- /* see the comment in lru_gen_age_node() */
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
- sc->memcgs_need_aging = false;
-done:
clear_mm_walk();
blk_finish_plug(&plug);
+done:
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
}
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+}
+#endif
+
/******************************************************************************
* state change
******************************************************************************/
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
if (lrugen->enabled) {
enum lru_list lru;
@@ -5267,7 +5451,7 @@ static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
int gen, type, zone;
for_each_gen_type_zone(gen, type, zone) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->folios[gen][type][zone]))
return false;
}
}
@@ -5312,7 +5496,7 @@ static bool drain_evictable(struct lruvec *lruvec)
int remaining = MAX_LRU_BATCH;
for_each_gen_type_zone(gen, type, zone) {
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+ struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
while (!list_empty(head)) {
bool success;
@@ -5533,7 +5717,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
int i;
int type, tier;
int hist = lru_hist_from_seq(seq);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5583,7 +5767,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v)
unsigned long seq;
bool full = !debugfs_real_fops(m->file)->write;
struct lruvec *lruvec = v;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -5680,7 +5864,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;
- if (!evict_folios(lruvec, sc, swappiness, NULL))
+ if (!evict_folios(lruvec, sc, swappiness))
return 0;
cond_resched();
@@ -5701,11 +5885,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();
if (!memcg)
@@ -5765,7 +5949,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
blk_start_plug(&plug);
- if (!set_mm_walk(NULL)) {
+ if (!set_mm_walk(NULL, true)) {
err = -ENOMEM;
goto done;
}
@@ -5837,7 +6021,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
{
int i;
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
@@ -5846,13 +6030,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
lrugen->timestamps[i] = jiffies;
for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
}
#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5876,7 +6073,69 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */
static int __init init_lru_gen(void)
{
@@ -5903,6 +6162,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
{
}
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5916,7 +6179,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
bool proportional_reclaim;
struct blk_plug plug;
- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -6159,6 +6422,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
struct lruvec *target_lruvec;
bool reclaimable = false;
+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
diff --git a/mm/workingset.c b/mm/workingset.c
index 1a86645b7b3c..f194d13beabb 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct folio *folio)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
int type = folio_is_file_lru(folio);
int delta = folio_nr_pages(folio);
int refs = folio_lru_refs(folio);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_folio *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = folio_is_file_lru(folio);
@@ -457,6 +457,7 @@ void workingset_refault(struct folio *folio, void *shadow)
*/
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
+ pgdat = folio_pgdat(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
@@ -474,7 +475,7 @@ void workingset_refault(struct folio *folio, void *shadow)
workingset_size += lruvec_page_state(eviction_lruvec,
NR_INACTIVE_FILE);
}
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
if (file) {
diff --git a/mm/z3fold.c b/mm/z3fold.c
index a4de0c317ac7..0cef845d397b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1450,7 +1450,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
struct z3fold_header *zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
if (test_bit(PAGE_HEADLESS, &page->private))
@@ -1490,7 +1489,6 @@ static int z3fold_page_migrate(struct page *newpage, struct page *page,
struct z3fold_header *zhdr, *new_zhdr;
struct z3fold_pool *pool;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 702bc3fd687a..9d27d9b00bce 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2056,7 +2056,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
* Page is locked so zspage couldn't be destroyed. For detail, look at
* lock_zspage in free_zspage.
*/
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
@@ -2088,7 +2087,6 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
/* The page is locked, so this pointer must remain valid */
@@ -2153,7 +2151,6 @@ static void zs_page_putback(struct page *page)
{
struct zspage *zspage;
- VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);