Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 105
-rw-r--r--  mm/Kconfig.debug | 6
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/damon/core.c | 129
-rw-r--r--  mm/damon/lru_sort.c | 3
-rw-r--r--  mm/damon/ops-common.c | 40
-rw-r--r--  mm/damon/reclaim.c | 3
-rw-r--r--  mm/damon/stat.c | 3
-rw-r--r--  mm/damon/sysfs-schemes.c | 59
-rw-r--r--  mm/damon/sysfs.c | 53
-rw-r--r--  mm/damon/tests/core-kunit.h | 708
-rw-r--r--  mm/damon/tests/sysfs-kunit.h | 25
-rw-r--r--  mm/damon/tests/vaddr-kunit.h | 26
-rw-r--r--  mm/damon/vaddr.c | 143
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/debug_vm_pgtable.c | 111
-rw-r--r--  mm/filemap.c | 99
-rw-r--r--  mm/gup.c | 12
-rw-r--r--  mm/hmm.c | 45
-rw-r--r--  mm/huge_memory.c | 1208
-rw-r--r--  mm/hugetlb.c | 983
-rw-r--r--  mm/hugetlb_internal.h | 117
-rw-r--r--  mm/hugetlb_sysctl.c | 134
-rw-r--r--  mm/hugetlb_sysfs.c | 502
-rw-r--r--  mm/hugetlb_vmemmap.c | 9
-rw-r--r--  mm/internal.h | 73
-rw-r--r--  mm/kasan/common.c | 3
-rw-r--r--  mm/kasan/generic.c | 5
-rw-r--r--  mm/kasan/kasan.h | 7
-rw-r--r--  mm/kasan/shadow.c | 32
-rw-r--r--  mm/kasan/tags.c | 2
-rw-r--r--  mm/kfence/core.c | 24
-rw-r--r--  mm/khugepaged.c | 209
-rw-r--r--  mm/kmsan/core.c | 2
-rw-r--r--  mm/kmsan/shadow.c | 6
-rw-r--r--  mm/ksm.c | 159
-rw-r--r--  mm/madvise.c | 160
-rw-r--r--  mm/mapping_dirty_helpers.c | 2
-rw-r--r--  mm/memcontrol.c | 70
-rw-r--r--  mm/memory-failure.c | 192
-rw-r--r--  mm/memory-tiers.c | 2
-rw-r--r--  mm/memory.c | 333
-rw-r--r--  mm/memory_hotplug.c | 10
-rw-r--r--  mm/mempolicy.c | 86
-rw-r--r--  mm/memremap.c | 40
-rw-r--r--  mm/migrate.c | 74
-rw-r--r--  mm/migrate_device.c | 629
-rw-r--r--  mm/mincore.c | 25
-rw-r--r--  mm/mlock.c | 2
-rw-r--r--  mm/mm_init.c | 2
-rw-r--r--  mm/mmap.c | 37
-rw-r--r--  mm/mmap_lock.c | 49
-rw-r--r--  mm/mmu_gather.c | 2
-rw-r--r--  mm/mprotect.c | 150
-rw-r--r--  mm/mremap.c | 26
-rw-r--r--  mm/mseal.c | 9
-rw-r--r--  mm/oom_kill.c | 1
-rw-r--r--  mm/page-writeback.c | 2
-rw-r--r--  mm/page_alloc.c | 219
-rw-r--r--  mm/page_idle.c | 15
-rw-r--r--  mm/page_owner.c | 98
-rw-r--r--  mm/page_table_check.c | 33
-rw-r--r--  mm/page_vma_mapped.c | 68
-rw-r--r--  mm/pagewalk.c | 52
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/pgtable-generic.c | 44
-rw-r--r--  mm/pt_reclaim.c | 3
-rw-r--r--  mm/ptdump.c | 10
-rw-r--r--  mm/rmap.c | 91
-rw-r--r--  mm/secretmem.c | 4
-rw-r--r--  mm/shmem.c | 161
-rw-r--r--  mm/slab_common.c | 2
-rw-r--r--  mm/slub.c | 31
-rw-r--r--  mm/sparse-vmemmap.c | 2
-rw-r--r--  mm/swap.h | 21
-rw-r--r--  mm/swap_state.c | 15
-rw-r--r--  mm/swapfile.c | 234
-rw-r--r--  mm/userfaultfd.c | 127
-rw-r--r--  mm/util.c | 146
-rw-r--r--  mm/vma.c | 229
-rw-r--r--  mm/vma.h | 140
-rw-r--r--  mm/vma_exec.c | 5
-rw-r--r--  mm/vmalloc.c | 271
-rw-r--r--  mm/vmscan.c | 83
-rw-r--r--  mm/vmstat.c | 53
-rw-r--r--  mm/workingset.c | 2
-rw-r--r--  mm/zswap.c | 7
87 files changed, 5861 insertions(+), 3261 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index ca3f146bc705..bd0ea5454af8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -695,15 +695,6 @@ config PCP_BATCH_SCALE_MAX
config PHYS_ADDR_T_64BIT
def_bool 64BIT
-config BOUNCE
- bool "Enable bounce buffers"
- default y
- depends on BLOCK && MMU && HIGHMEM
- help
- Enable bounce buffers for devices that cannot access the full range of
- memory available to the CPU. Enabled by default when HIGHMEM is
- selected, but you may say n to override this.
-
config MMU_NOTIFIER
bool
select INTERVAL_TREE
@@ -749,7 +740,7 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
- select RAS
+ select INTERVAL_TREE
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
@@ -862,6 +853,97 @@ choice
enabled at runtime via sysfs.
endchoice
+choice
+ prompt "Shmem hugepage allocation defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER
+ help
+ Selects the hugepage allocation policy defaults for
+ the internal shmem mount.
+
+ The selection made here can be overridden by using the kernel
+ command line 'transparent_hugepage_shmem=' option.
+
+ config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER
+ bool "never"
+ help
+ Disable hugepage allocation for shmem mount by default. It can
+ still be enabled with the kernel command line
+ 'transparent_hugepage_shmem=' option or at runtime via sysfs
+ knob. Note that madvise(MADV_COLLAPSE) can still cause
+ transparent huge pages to be obtained even if this mode is
+ specified.
+
+ config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS
+ bool "always"
+ help
+ Always attempt to allocate hugepages for the shmem mount. This
+ can increase the memory footprint of applications without a
+ guaranteed benefit, but it works automatically for all
+ applications.
+
+ config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE
+ bool "within_size"
+ help
+ Enable hugepage allocation for shmem mount if the allocation
+ will be fully within the i_size. This configuration also takes
+ into account any madvise(MADV_HUGEPAGE) hints that may be
+ provided by the applications.
+
+ config TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE
+ bool "advise"
+ help
+ Enable hugepage allocation for the shmem mount exclusively when
+ applications supply the madvise(MADV_HUGEPAGE) hint.
+ This ensures that hugepages are used only in response to explicit
+ requests from applications.
+endchoice
+
+choice
+ prompt "Tmpfs hugepage allocation defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER
+ help
+ Selects the hugepage allocation policy defaults for
+ the tmpfs mount.
+
+ The selection made here can be overridden by using the kernel
+ command line 'transparent_hugepage_tmpfs=' option.
+
+ config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER
+ bool "never"
+ help
+ Disable hugepage allocation for tmpfs mount by default. It can
+ still be enabled with the kernel command line
+ 'transparent_hugepage_tmpfs=' option. Note that
+ madvise(MADV_COLLAPSE) can still cause transparent huge pages
+ to be obtained even if this mode is specified.
+
+ config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS
+ bool "always"
+ help
+ Always attempt to allocate hugepages for the tmpfs mount. This
+ can increase the memory footprint of applications without a
+ guaranteed benefit, but it works automatically for all
+ applications.
+
+ config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE
+ bool "within_size"
+ help
+ Enable hugepage allocation for tmpfs mount if the allocation
+ will be fully within the i_size. This configuration also takes
+ into account any madvise(MADV_HUGEPAGE) hints that may be
+ provided by the applications.
+
+ config TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE
+ bool "advise"
+ help
+ Enable hugepage allocation for the tmpfs mount exclusively when
+ applications supply the madvise(MADV_HUGEPAGE) hint.
+ This ensures that hugepages are used only in response to explicit
+ requests from applications.
+endchoice
+
config THP_SWAP
def_bool y
depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT
@@ -915,6 +997,9 @@ config HAVE_GIGANTIC_FOLIOS
def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
(ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+config ASYNC_KERNEL_PGTABLE_FREE
+ def_bool n
+
# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
def_bool n
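
For orientation, here is a minimal sketch of how build-time choices like the shmem ones added above typically feed a runtime default. The SHMEM_HUGE_* constants and the shmem_huge variable follow mm/shmem.c, but the exact wiring shown is an assumption, not quoted from this patch:

#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
static int shmem_huge __read_mostly = SHMEM_HUGE_ALWAYS;
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
static int shmem_huge __read_mostly = SHMEM_HUGE_WITHIN_SIZE;
#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
static int shmem_huge __read_mostly = SHMEM_HUGE_ADVISE;
#else /* CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
#endif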
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 32b65073d0cc..7638d75b27db 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -175,10 +175,10 @@ config DEBUG_PAGE_REF
nil until the tracepoints are actually enabled.
config DEBUG_RODATA_TEST
- bool "Testcase for the marking rodata read-only"
- depends on STRICT_KERNEL_RWX
+ bool "Testcase for the marking rodata read-only"
+ depends on STRICT_KERNEL_RWX
help
- This option enables a testcase for the setting rodata read-only.
+ This option enables a testcase for the setting rodata read-only.
config ARCH_HAS_DEBUG_WX
bool
diff --git a/mm/Makefile b/mm/Makefile
index 21abb3353550..00ceb2418b64 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -78,7 +78,7 @@ endif
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
-obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_HUGETLBFS) += hugetlb.o hugetlb_sysfs.o hugetlb_sysctl.o
ifdef CONFIG_CMA
obj-$(CONFIG_HUGETLBFS) += hugetlb_cma.o
endif
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 109b050c795a..f9fc0375890a 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -10,6 +10,7 @@
#include <linux/damon.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/psi.h>
#include <linux/slab.h>
@@ -19,11 +20,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
-#ifdef CONFIG_DAMON_KUNIT_TEST
-#undef DAMON_MIN_REGION
-#define DAMON_MIN_REGION 1
-#endif
-
static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;
static bool running_exclusive_ctxs;
@@ -305,7 +301,7 @@ void damos_add_filter(struct damos *s, struct damos_filter *f)
if (damos_filter_for_ops(f->type))
list_add_tail(&f->list, &s->ops_filters);
else
- list_add_tail(&f->list, &s->filters);
+ list_add_tail(&f->list, &s->core_filters);
}
static void damos_del_filter(struct damos_filter *f)
@@ -396,7 +392,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
*/
scheme->next_apply_sis = 0;
scheme->walk_completed = false;
- INIT_LIST_HEAD(&scheme->filters);
+ INIT_LIST_HEAD(&scheme->core_filters);
INIT_LIST_HEAD(&scheme->ops_filters);
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
@@ -449,7 +445,7 @@ void damon_destroy_scheme(struct damos *s)
damos_for_each_quota_goal_safe(g, g_next, &s->quota)
damos_destroy_quota_goal(g);
- damos_for_each_filter_safe(f, next, s)
+ damos_for_each_core_filter_safe(f, next, s)
damos_destroy_filter(f);
damos_for_each_ops_filter_safe(f, next, s)
@@ -478,6 +474,7 @@ struct damon_target *damon_new_target(void)
t->nr_regions = 0;
INIT_LIST_HEAD(&t->regions_list);
INIT_LIST_HEAD(&t->list);
+ t->obsolete = false;
return t;
}
@@ -788,6 +785,11 @@ static void damos_commit_quota_goal_union(
case DAMOS_QUOTA_NODE_MEM_FREE_BP:
dst->nid = src->nid;
break;
+ case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+ case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
+ dst->nid = src->nid;
+ dst->memcg_id = src->memcg_id;
+ break;
default:
break;
}
@@ -857,12 +859,12 @@ static int damos_commit_quota(struct damos_quota *dst, struct damos_quota *src)
return 0;
}
-static struct damos_filter *damos_nth_filter(int n, struct damos *s)
+static struct damos_filter *damos_nth_core_filter(int n, struct damos *s)
{
struct damos_filter *filter;
int i = 0;
- damos_for_each_filter(filter, s) {
+ damos_for_each_core_filter(filter, s) {
if (i++ == n)
return filter;
}
@@ -916,15 +918,15 @@ static int damos_commit_core_filters(struct damos *dst, struct damos *src)
struct damos_filter *dst_filter, *next, *src_filter, *new_filter;
int i = 0, j = 0;
- damos_for_each_filter_safe(dst_filter, next, dst) {
- src_filter = damos_nth_filter(i++, src);
+ damos_for_each_core_filter_safe(dst_filter, next, dst) {
+ src_filter = damos_nth_core_filter(i++, src);
if (src_filter)
damos_commit_filter(dst_filter, src_filter);
else
damos_destroy_filter(dst_filter);
}
- damos_for_each_filter_safe(src_filter, next, src) {
+ damos_for_each_core_filter_safe(src_filter, next, src) {
if (j++ < i)
continue;
@@ -988,41 +990,37 @@ static void damos_set_filters_default_reject(struct damos *s)
s->core_filters_default_reject = false;
else
s->core_filters_default_reject =
- damos_filters_default_reject(&s->filters);
+ damos_filters_default_reject(&s->core_filters);
s->ops_filters_default_reject =
damos_filters_default_reject(&s->ops_filters);
}
-static int damos_commit_dests(struct damos *dst, struct damos *src)
+static int damos_commit_dests(struct damos_migrate_dests *dst,
+ struct damos_migrate_dests *src)
{
- struct damos_migrate_dests *dst_dests, *src_dests;
-
- dst_dests = &dst->migrate_dests;
- src_dests = &src->migrate_dests;
-
- if (dst_dests->nr_dests != src_dests->nr_dests) {
- kfree(dst_dests->node_id_arr);
- kfree(dst_dests->weight_arr);
+ if (dst->nr_dests != src->nr_dests) {
+ kfree(dst->node_id_arr);
+ kfree(dst->weight_arr);
- dst_dests->node_id_arr = kmalloc_array(src_dests->nr_dests,
- sizeof(*dst_dests->node_id_arr), GFP_KERNEL);
- if (!dst_dests->node_id_arr) {
- dst_dests->weight_arr = NULL;
+ dst->node_id_arr = kmalloc_array(src->nr_dests,
+ sizeof(*dst->node_id_arr), GFP_KERNEL);
+ if (!dst->node_id_arr) {
+ dst->weight_arr = NULL;
return -ENOMEM;
}
- dst_dests->weight_arr = kmalloc_array(src_dests->nr_dests,
- sizeof(*dst_dests->weight_arr), GFP_KERNEL);
- if (!dst_dests->weight_arr) {
+ dst->weight_arr = kmalloc_array(src->nr_dests,
+ sizeof(*dst->weight_arr), GFP_KERNEL);
+ if (!dst->weight_arr) {
/* ->node_id_arr will be freed by scheme destruction */
return -ENOMEM;
}
}
- dst_dests->nr_dests = src_dests->nr_dests;
- for (int i = 0; i < src_dests->nr_dests; i++) {
- dst_dests->node_id_arr[i] = src_dests->node_id_arr[i];
- dst_dests->weight_arr[i] = src_dests->weight_arr[i];
+ dst->nr_dests = src->nr_dests;
+ for (int i = 0; i < src->nr_dests; i++) {
+ dst->node_id_arr[i] = src->node_id_arr[i];
+ dst->weight_arr[i] = src->weight_arr[i];
}
return 0;
@@ -1069,7 +1067,7 @@ static int damos_commit(struct damos *dst, struct damos *src)
dst->wmarks = src->wmarks;
dst->target_nid = src->target_nid;
- err = damos_commit_dests(dst, src);
+ err = damos_commit_dests(&dst->migrate_dests, &src->migrate_dests);
if (err)
return err;
@@ -1181,7 +1179,11 @@ static int damon_commit_targets(
damon_for_each_target_safe(dst_target, next, dst) {
src_target = damon_nth_target(i++, src);
- if (src_target) {
+ /*
+ * If src target is obsolete, do not commit the parameters to
+ * the dst target, and further remove the dst target.
+ */
+ if (src_target && !src_target->obsolete) {
err = damon_commit_target(
dst_target, damon_target_has_pid(dst),
src_target, damon_target_has_pid(src),
@@ -1204,6 +1206,9 @@ static int damon_commit_targets(
damon_for_each_target_safe(src_target, next, src) {
if (j++ < i)
continue;
+ /* target to remove has no matching dst */
+ if (src_target->obsolete)
+ return -EINVAL;
new_target = damon_new_target();
if (!new_target)
return -ENOMEM;
@@ -1434,7 +1439,7 @@ bool damon_is_running(struct damon_ctx *ctx)
* Ask DAMON worker thread (kdamond) of @ctx to call a function with an
* argument data that respectively passed via &damon_call_control->fn and
* &damon_call_control->data of @control. If &damon_call_control->repeat of
- * @control is set, further wait until the kdamond finishes handling of the
+ * @control is unset, further wait until the kdamond finishes handling of the
* request. Otherwise, return as soon as the request is made.
*
* The kdamond executes the function with the argument in the main loop, just
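
As a usage illustration of the corrected semantics, a minimal caller might look like the sketch below. The damos_call_control field names follow include/linux/damon.h, but treat this fragment as an illustrative outline rather than code from the patch:

static int example_fn(void *data)
{
	/* executed by the kdamond in its main loop */
	return 0;
}

static int example_call(struct damon_ctx *ctx)
{
	struct damon_call_control control = {
		.fn = example_fn,
		.data = NULL,
		/* .repeat left unset: damon_call() waits for completion */
	};

	return damon_call(ctx, &control);
}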
@@ -1757,7 +1762,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t,
struct damos_filter *filter;
s->core_filters_allowed = false;
- damos_for_each_filter(filter, s) {
+ damos_for_each_core_filter(filter, s) {
if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) {
if (filter->allow)
s->core_filters_allowed = true;
@@ -2035,12 +2040,50 @@ static __kernel_ulong_t damos_get_node_mem_bp(
numerator = i.freeram;
return numerator * 10000 / i.totalram;
}
+
+static unsigned long damos_get_node_memcg_used_bp(
+ struct damos_quota_goal *goal)
+{
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ unsigned long used_pages, numerator;
+ struct sysinfo i;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(goal->memcg_id);
+ rcu_read_unlock();
+ if (!memcg) {
+ if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
+ return 0;
+ else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
+ return 10000;
+ }
+ mem_cgroup_flush_stats(memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid));
+ used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON);
+ used_pages += lruvec_page_state(lruvec, NR_INACTIVE_ANON);
+ used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+ used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE);
+
+ si_meminfo_node(&i, goal->nid);
+ if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
+ numerator = used_pages;
+ else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
+ numerator = i.totalram - used_pages;
+ return numerator * 10000 / i.totalram;
+}
#else
static __kernel_ulong_t damos_get_node_mem_bp(
struct damos_quota_goal *goal)
{
return 0;
}
+
+static unsigned long damos_get_node_memcg_used_bp(
+ struct damos_quota_goal *goal)
+{
+ return 0;
+}
#endif
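
To make the basis-point arithmetic above concrete, a small standalone sketch (the helper name and example numbers are illustrative only):

/*
 * E.g. 4 GiB of LRU pages charged to the memcg on a node with 16 GiB
 * total: used = 4 * 10000 / 16 = 2500 bp, free = 12 * 10000 / 16 = 7500 bp.
 */
static unsigned long example_node_memcg_bp(unsigned long used_pages,
		unsigned long total_pages, bool free_metric)
{
	unsigned long numerator = free_metric ?
			total_pages - used_pages : used_pages;

	return numerator * 10000 / total_pages;
}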
@@ -2061,6 +2104,10 @@ static void damos_set_quota_goal_current_value(struct damos_quota_goal *goal)
case DAMOS_QUOTA_NODE_MEM_FREE_BP:
goal->current_value = damos_get_node_mem_bp(goal);
break;
+ case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+ case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
+ goal->current_value = damos_get_node_memcg_used_bp(goal);
+ break;
default:
break;
}
@@ -2770,6 +2817,7 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* @t: The monitoring target to set the region.
* @start: The pointer to the start address of the region.
* @end: The pointer to the end address of the region.
+ * @min_sz_region: Minimum region size.
*
* This function sets the region of @t as requested by @start and @end. If the
* values of @start and @end are zero, however, this function finds the biggest
@@ -2780,7 +2828,8 @@ static bool damon_find_biggest_system_ram(unsigned long *start,
* Return: 0 on success, negative error code otherwise.
*/
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
- unsigned long *start, unsigned long *end)
+ unsigned long *start, unsigned long *end,
+ unsigned long min_sz_region)
{
struct damon_addr_range addr_range;
@@ -2793,7 +2842,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t,
addr_range.start = *start;
addr_range.end = *end;
- return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION);
+ return damon_set_regions(t, &addr_range, 1, min_sz_region);
}
/*
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 42b9a656f9de..49b4bc294f4e 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -242,7 +242,8 @@ static int damon_lru_sort_apply_parameters(void)
err = damon_set_region_biggest_system_ram_default(param_target,
&monitor_region_start,
- &monitor_region_end);
+ &monitor_region_end,
+ param_ctx->min_sz_region);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 998c5180a603..a218d9922234 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -11,7 +11,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include "../internal.h"
#include "ops-common.h"
@@ -51,7 +51,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr
if (likely(pte_present(pteval)))
pfn = pte_pfn(pteval);
else
- pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ pfn = softleaf_to_pfn(softleaf_from_pte(pteval));
folio = damon_get_folio(pfn);
if (!folio)
@@ -75,12 +75,24 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr
void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd)));
+ pmd_t pmdval = pmdp_get(pmd);
+ struct folio *folio;
+ bool young = false;
+ unsigned long pfn;
+ if (likely(pmd_present(pmdval)))
+ pfn = pmd_pfn(pmdval);
+ else
+ pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));
+
+ folio = damon_get_folio(pfn);
if (!folio)
return;
- if (pmdp_clear_young_notify(vma, addr, pmd))
+ if (likely(pmd_present(pmdval)))
+ young |= pmdp_clear_young_notify(vma, addr, pmd);
+ young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE);
+ if (young)
folio_set_young(folio);
folio_set_idle(folio);
@@ -162,21 +174,17 @@ void damon_folio_mkold(struct folio *folio)
.rmap_one = damon_folio_mkold_one,
.anon_lock = folio_lock_anon_vma_read,
};
- bool need_lock;
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
folio_set_idle(folio);
return;
}
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
+ if (!folio_trylock(folio))
return;
rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
+ folio_unlock(folio);
}
@@ -203,7 +211,9 @@ static bool damon_folio_young_one(struct folio *folio,
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- *accessed = pmd_young(pmdp_get(pvmw.pmd)) ||
+ pmd_t pmd = pmdp_get(pvmw.pmd);
+
+ *accessed = (pmd_present(pmd) && pmd_young(pmd)) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
#else
@@ -228,7 +238,6 @@ bool damon_folio_young(struct folio *folio)
.rmap_one = damon_folio_young_one,
.anon_lock = folio_lock_anon_vma_read,
};
- bool need_lock;
if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
if (folio_test_idle(folio))
@@ -237,14 +246,11 @@ bool damon_folio_young(struct folio *folio)
return true;
}
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
+ if (!folio_trylock(folio))
return false;
rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
+ folio_unlock(folio);
return accessed;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 7ba3d0f9a19a..36a582e09eae 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -250,7 +250,8 @@ static int damon_reclaim_apply_parameters(void)
err = damon_set_region_biggest_system_ram_default(param_target,
&monitor_region_start,
- &monitor_region_end);
+ &monitor_region_end,
+ param_ctx->min_sz_region);
if (err)
goto out;
err = damon_commit_ctx(ctx, param_ctx);
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index bf8626859902..ed8e3629d31a 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -188,7 +188,8 @@ static struct damon_ctx *damon_stat_build_ctx(void)
if (!target)
goto free_out;
damon_add_target(ctx, target);
- if (damon_set_region_biggest_system_ram_default(target, &start, &end))
+ if (damon_set_region_biggest_system_ram_default(target, &start, &end,
+ ctx->min_sz_region))
goto free_out;
return ctx;
free_out:
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 6536f16006c9..30d20f5b3192 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -999,6 +999,7 @@ struct damos_sysfs_quota_goal {
unsigned long target_value;
unsigned long current_value;
int nid;
+ char *path;
};
static struct damos_sysfs_quota_goal *damos_sysfs_quota_goal_alloc(void)
@@ -1029,6 +1030,14 @@ struct damos_sysfs_qgoal_metric_name damos_sysfs_qgoal_metric_names[] = {
.metric = DAMOS_QUOTA_NODE_MEM_FREE_BP,
.name = "node_mem_free_bp",
},
+ {
+ .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP,
+ .name = "node_memcg_used_bp",
+ },
+ {
+ .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
+ .name = "node_memcg_free_bp",
+ },
};
static ssize_t target_metric_show(struct kobject *kobj,
@@ -1112,7 +1121,6 @@ static ssize_t nid_show(struct kobject *kobj,
struct damos_sysfs_quota_goal *goal = container_of(kobj, struct
damos_sysfs_quota_goal, kobj);
- /* todo: return error if the goal is not using nid */
return sysfs_emit(buf, "%d\n", goal->nid);
}
@@ -1128,10 +1136,39 @@ static ssize_t nid_store(struct kobject *kobj,
return err ? err : count;
}
+static ssize_t path_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj,
+ struct damos_sysfs_quota_goal, kobj);
+
+ return sysfs_emit(buf, "%s\n", goal->path ? goal->path : "");
+}
+
+static ssize_t path_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damos_sysfs_quota_goal *goal = container_of(kobj,
+ struct damos_sysfs_quota_goal, kobj);
+ char *path = kmalloc_array(size_add(count, 1), sizeof(*path),
+ GFP_KERNEL);
+
+ if (!path)
+ return -ENOMEM;
+
+ strscpy(path, buf, count + 1);
+ kfree(goal->path);
+ goal->path = path;
+ return count;
+}
+
static void damos_sysfs_quota_goal_release(struct kobject *kobj)
{
- /* or, notify this release to the feed callback */
- kfree(container_of(kobj, struct damos_sysfs_quota_goal, kobj));
+ struct damos_sysfs_quota_goal *goal = container_of(kobj,
+ struct damos_sysfs_quota_goal, kobj);
+
+ kfree(goal->path);
+ kfree(goal);
}
static struct kobj_attribute damos_sysfs_quota_goal_target_metric_attr =
@@ -1146,11 +1183,15 @@ static struct kobj_attribute damos_sysfs_quota_goal_current_value_attr =
static struct kobj_attribute damos_sysfs_quota_goal_nid_attr =
__ATTR_RW_MODE(nid, 0600);
+static struct kobj_attribute damos_sysfs_quota_goal_path_attr =
+ __ATTR_RW_MODE(path, 0600);
+
static struct attribute *damos_sysfs_quota_goal_attrs[] = {
&damos_sysfs_quota_goal_target_metric_attr.attr,
&damos_sysfs_quota_goal_target_value_attr.attr,
&damos_sysfs_quota_goal_current_value_attr.attr,
&damos_sysfs_quota_goal_nid_attr.attr,
+ &damos_sysfs_quota_goal_path_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damos_sysfs_quota_goal);
@@ -2492,7 +2533,7 @@ static int damos_sysfs_add_quota_score(
struct damos_quota *quota)
{
struct damos_quota_goal *goal;
- int i;
+ int i, err;
for (i = 0; i < sysfs_goals->nr; i++) {
struct damos_sysfs_quota_goal *sysfs_goal =
@@ -2513,6 +2554,16 @@ static int damos_sysfs_add_quota_score(
case DAMOS_QUOTA_NODE_MEM_FREE_BP:
goal->nid = sysfs_goal->nid;
break;
+ case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+ case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
+ err = damon_sysfs_memcg_path_to_id(
+ sysfs_goal->path, &goal->memcg_id);
+ if (err) {
+ damos_destroy_quota_goal(goal);
+ return err;
+ }
+ goal->nid = sysfs_goal->nid;
+ break;
default:
break;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 3c0d727788c8..e2bd2d7becdd 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -212,6 +212,7 @@ struct damon_sysfs_target {
struct kobject kobj;
struct damon_sysfs_regions *regions;
int pid;
+ bool obsolete;
};
static struct damon_sysfs_target *damon_sysfs_target_alloc(void)
@@ -263,6 +264,29 @@ static ssize_t pid_target_store(struct kobject *kobj,
return count;
}
+static ssize_t obsolete_target_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+
+ return sysfs_emit(buf, "%c\n", target->obsolete ? 'Y' : 'N');
+}
+
+static ssize_t obsolete_target_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+ bool obsolete;
+ int err = kstrtobool(buf, &obsolete);
+
+ if (err)
+ return err;
+ target->obsolete = obsolete;
+ return count;
+}
+
static void damon_sysfs_target_release(struct kobject *kobj)
{
kfree(container_of(kobj, struct damon_sysfs_target, kobj));
@@ -271,8 +295,12 @@ static void damon_sysfs_target_release(struct kobject *kobj)
static struct kobj_attribute damon_sysfs_target_pid_attr =
__ATTR_RW_MODE(pid_target, 0600);
+static struct kobj_attribute damon_sysfs_target_obsolete_attr =
+ __ATTR_RW_MODE(obsolete_target, 0600);
+
static struct attribute *damon_sysfs_target_attrs[] = {
&damon_sysfs_target_pid_attr.attr,
+ &damon_sysfs_target_obsolete_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(damon_sysfs_target);
@@ -1264,7 +1292,7 @@ enum damon_sysfs_cmd {
DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS,
/*
* @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring
- * intevals.
+ * intervals.
*/
DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS,
/*
@@ -1377,6 +1405,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
/* caller will destroy targets */
return -EINVAL;
}
+ t->obsolete = sys_target->obsolete;
return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region);
}
@@ -1452,6 +1481,26 @@ static struct damon_ctx *damon_sysfs_build_ctx(
struct damon_sysfs_context *sys_ctx);
/*
+ * Return a new damon_ctx for testing new parameters to commit.
+ */
+static struct damon_ctx *damon_sysfs_new_test_ctx(
+ struct damon_ctx *running_ctx)
+{
+ struct damon_ctx *test_ctx;
+ int err;
+
+ test_ctx = damon_new_ctx();
+ if (!test_ctx)
+ return NULL;
+ err = damon_commit_ctx(test_ctx, running_ctx);
+ if (err) {
+ damon_destroy_ctx(test_ctx);
+ return NULL;
+ }
+ return test_ctx;
+}
+
+/*
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
*
@@ -1472,7 +1521,7 @@ static int damon_sysfs_commit_input(void *data)
param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
if (IS_ERR(param_ctx))
return PTR_ERR(param_ctx);
- test_ctx = damon_new_ctx();
+ test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx);
if (!test_ctx)
return -ENOMEM;
err = damon_commit_ctx(test_ctx, param_ctx);
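
In outline, the resulting commit path validates the user's new parameters against a disposable copy of the live context before touching the running one. A hedged sketch of that flow (cleanup on the error paths elided):

static int example_commit(struct damon_ctx *running_ctx,
		struct damon_ctx *param_ctx)
{
	struct damon_ctx *test_ctx;
	int err;

	test_ctx = damon_sysfs_new_test_ctx(running_ctx);
	if (!test_ctx)
		return -ENOMEM;
	/* dry-run the new parameters on a copy of the live state... */
	err = damon_commit_ctx(test_ctx, param_ctx);
	if (!err)
		/* ...and only then apply them to the running context */
		err = damon_commit_ctx(running_ctx, param_ctx);
	damon_destroy_ctx(test_ctx);
	return err;
}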
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 51369e35298b..a1eff023e928 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -20,11 +20,17 @@ static void damon_test_regions(struct kunit *test)
struct damon_target *t;
r = damon_new_region(1, 2);
+ if (!r)
+ kunit_skip(test, "region alloc fail");
KUNIT_EXPECT_EQ(test, 1ul, r->ar.start);
KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
t = damon_new_target();
+ if (!t) {
+ damon_free_region(r);
+ kunit_skip(test, "target alloc fail");
+ }
KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
damon_add_region(r, t);
@@ -52,7 +58,14 @@ static void damon_test_target(struct kunit *test)
struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
+ if (!c)
+ kunit_skip(test, "ctx alloc fail");
+
t = damon_new_target();
+ if (!t) {
+ damon_destroy_ctx(c);
+ kunit_skip(test, "target alloc fail");
+ }
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_add_target(c, t);
@@ -84,8 +97,15 @@ static void damon_test_aggregate(struct kunit *test)
struct damon_region *r;
int it, ir;
+ if (!ctx)
+ kunit_skip(test, "ctx alloc fail");
+
for (it = 0; it < 3; it++) {
t = damon_new_target();
+ if (!t) {
+ damon_destroy_ctx(ctx);
+ kunit_skip(test, "target alloc fail");
+ }
damon_add_target(ctx, t);
}
@@ -93,6 +113,10 @@ static void damon_test_aggregate(struct kunit *test)
damon_for_each_target(t, ctx) {
for (ir = 0; ir < 3; ir++) {
r = damon_new_region(saddr[it][ir], eaddr[it][ir]);
+ if (!r) {
+ damon_destroy_ctx(ctx);
+ kunit_skip(test, "region alloc fail");
+ }
r->nr_accesses = accesses[it][ir];
r->nr_accesses_bp = accesses[it][ir] * 10000;
damon_add_region(r, t);
@@ -120,12 +144,17 @@ static void damon_test_aggregate(struct kunit *test)
static void damon_test_split_at(struct kunit *test)
{
- struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
struct damon_region *r, *r_new;
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "target alloc fail");
r = damon_new_region(0, 100);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
r->nr_accesses_bp = 420000;
r->nr_accesses = 42;
r->last_nr_accesses = 15;
@@ -143,7 +172,6 @@ static void damon_test_split_at(struct kunit *test)
KUNIT_EXPECT_EQ(test, r->last_nr_accesses, r_new->last_nr_accesses);
damon_free_target(t);
- damon_destroy_ctx(c);
}
static void damon_test_merge_two(struct kunit *test)
@@ -153,11 +181,21 @@ static void damon_test_merge_two(struct kunit *test)
int i;
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "target alloc fail");
r = damon_new_region(0, 100);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
r->nr_accesses = 10;
r->nr_accesses_bp = 100000;
damon_add_region(r, t);
r2 = damon_new_region(100, 300);
+ if (!r2) {
+ damon_free_target(t);
+ kunit_skip(test, "second region alloc fail");
+ }
r2->nr_accesses = 20;
r2->nr_accesses_bp = 200000;
damon_add_region(r2, t);
@@ -203,8 +241,14 @@ static void damon_test_merge_regions_of(struct kunit *test)
int i;
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "target alloc fail");
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
r->nr_accesses = nrs[i];
r->nr_accesses_bp = nrs[i] * 10000;
damon_add_region(r, t);
@@ -223,24 +267,34 @@ static void damon_test_merge_regions_of(struct kunit *test)
static void damon_test_split_regions_of(struct kunit *test)
{
- struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
struct damon_region *r;
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "target alloc fail");
r = damon_new_region(0, 22);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
damon_add_region(r, t);
- damon_split_regions_of(t, 2, DAMON_MIN_REGION);
+ damon_split_regions_of(t, 2, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "second target alloc fail");
r = damon_new_region(0, 220);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "second region alloc fail");
+ }
damon_add_region(r, t);
- damon_split_regions_of(t, 4, DAMON_MIN_REGION);
+ damon_split_regions_of(t, 4, 1);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
- damon_destroy_ctx(c);
}
static void damon_test_ops_registration(struct kunit *test)
@@ -249,6 +303,9 @@ static void damon_test_ops_registration(struct kunit *test)
struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak;
bool need_cleanup = false;
+ if (!c)
+ kunit_skip(test, "ctx alloc fail");
+
/* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */
if (!damon_is_registered_ops(DAMON_OPS_VADDR)) {
bak.id = DAMON_OPS_VADDR;
@@ -294,16 +351,29 @@ static void damon_test_ops_registration(struct kunit *test)
static void damon_test_set_regions(struct kunit *test)
{
struct damon_target *t = damon_new_target();
- struct damon_region *r1 = damon_new_region(4, 16);
- struct damon_region *r2 = damon_new_region(24, 32);
+ struct damon_region *r1, *r2;
struct damon_addr_range range = {.start = 8, .end = 28};
unsigned long expects[] = {8, 16, 16, 24, 24, 28};
int expect_idx = 0;
struct damon_region *r;
+ if (!t)
+ kunit_skip(test, "target alloc fail");
+ r1 = damon_new_region(4, 16);
+ if (!r1) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
+ r2 = damon_new_region(24, 32);
+ if (!r2) {
+ damon_free_target(t);
+ damon_free_region(r1);
+ kunit_skip(test, "second region alloc fail");
+ }
+
damon_add_region(r1, t);
damon_add_region(r2, t);
- damon_set_regions(t, &range, 1, DAMON_MIN_REGION);
+ damon_set_regions(t, &range, 1, 1);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
damon_for_each_region(r, t) {
@@ -342,6 +412,9 @@ static void damon_test_update_monitoring_result(struct kunit *test)
struct damon_attrs new_attrs;
struct damon_region *r = damon_new_region(3, 7);
+ if (!r)
+ kunit_skip(test, "region alloc fail");
+
r->nr_accesses = 15;
r->nr_accesses_bp = 150000;
r->age = 20;
@@ -375,6 +448,9 @@ static void damon_test_set_attrs(struct kunit *test)
.sample_interval = 5000, .aggr_interval = 100000,};
struct damon_attrs invalid_attrs;
+ if (!c)
+ kunit_skip(test, "ctx alloc fail");
+
KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &valid_attrs), 0);
invalid_attrs = valid_attrs;
@@ -412,6 +488,8 @@ static void damos_test_new_filter(struct kunit *test)
struct damos_filter *filter;
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, false);
+ if (!filter)
+ kunit_skip(test, "filter alloc fail");
KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON);
KUNIT_EXPECT_EQ(test, filter->matching, true);
KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list);
@@ -419,20 +497,535 @@ static void damos_test_new_filter(struct kunit *test)
damos_destroy_filter(filter);
}
+static void damos_test_commit_quota_goal_for(struct kunit *test,
+ struct damos_quota_goal *dst,
+ struct damos_quota_goal *src)
+{
+ u64 dst_last_psi_total = 0;
+
+ if (dst->metric == DAMOS_QUOTA_SOME_MEM_PSI_US)
+ dst_last_psi_total = dst->last_psi_total;
+ damos_commit_quota_goal(dst, src);
+
+ KUNIT_EXPECT_EQ(test, dst->metric, src->metric);
+ KUNIT_EXPECT_EQ(test, dst->target_value, src->target_value);
+ if (src->metric == DAMOS_QUOTA_USER_INPUT)
+ KUNIT_EXPECT_EQ(test, dst->current_value, src->current_value);
+ if (dst_last_psi_total && src->metric == DAMOS_QUOTA_SOME_MEM_PSI_US)
+ KUNIT_EXPECT_EQ(test, dst->last_psi_total, dst_last_psi_total);
+ switch (dst->metric) {
+ case DAMOS_QUOTA_NODE_MEM_USED_BP:
+ case DAMOS_QUOTA_NODE_MEM_FREE_BP:
+ KUNIT_EXPECT_EQ(test, dst->nid, src->nid);
+ break;
+ case DAMOS_QUOTA_NODE_MEMCG_USED_BP:
+ case DAMOS_QUOTA_NODE_MEMCG_FREE_BP:
+ KUNIT_EXPECT_EQ(test, dst->nid, src->nid);
+ KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id);
+ break;
+ default:
+ break;
+ }
+}
+
+static void damos_test_commit_quota_goal(struct kunit *test)
+{
+ struct damos_quota_goal dst = {
+ .metric = DAMOS_QUOTA_SOME_MEM_PSI_US,
+ .target_value = 1000,
+ .current_value = 123,
+ .last_psi_total = 456,
+ };
+
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal){
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 789,
+ .current_value = 12});
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal){
+ .metric = DAMOS_QUOTA_NODE_MEM_FREE_BP,
+ .target_value = 345,
+ .current_value = 678,
+ .nid = 9,
+ });
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal){
+ .metric = DAMOS_QUOTA_NODE_MEM_USED_BP,
+ .target_value = 12,
+ .current_value = 345,
+ .nid = 6,
+ });
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal){
+ .metric = DAMOS_QUOTA_NODE_MEMCG_USED_BP,
+ .target_value = 456,
+ .current_value = 567,
+ .nid = 6,
+ .memcg_id = 7,
+ });
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal){
+ .metric = DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
+ .target_value = 890,
+ .current_value = 901,
+ .nid = 10,
+ .memcg_id = 1,
+ });
+ damos_test_commit_quota_goal_for(test, &dst,
+ &(struct damos_quota_goal) {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 789,
+ .current_value = 12,
+ });
+}
+
+static void damos_test_commit_quota_goals_for(struct kunit *test,
+ struct damos_quota_goal *dst_goals, int nr_dst_goals,
+ struct damos_quota_goal *src_goals, int nr_src_goals)
+{
+ struct damos_quota dst, src;
+ struct damos_quota_goal *goal, *next;
+ bool skip = true;
+ int i;
+
+ INIT_LIST_HEAD(&dst.goals);
+ INIT_LIST_HEAD(&src.goals);
+
+ for (i = 0; i < nr_dst_goals; i++) {
+ /*
+ * When nr_src_goals is smaller than dst_goals,
+ * damos_commit_quota_goals() will kfree() the dst goals.
+ * Make it kfree()-able.
+ */
+ goal = damos_new_quota_goal(dst_goals[i].metric,
+ dst_goals[i].target_value);
+ if (!goal)
+ goto out;
+ damos_add_quota_goal(&dst, goal);
+ }
+ skip = false;
+ for (i = 0; i < nr_src_goals; i++)
+ damos_add_quota_goal(&src, &src_goals[i]);
+
+ damos_commit_quota_goals(&dst, &src);
+
+ i = 0;
+ damos_for_each_quota_goal(goal, (&dst)) {
+ KUNIT_EXPECT_EQ(test, goal->metric, src_goals[i].metric);
+ KUNIT_EXPECT_EQ(test, goal->target_value,
+ src_goals[i++].target_value);
+ }
+ KUNIT_EXPECT_EQ(test, i, nr_src_goals);
+
+out:
+ damos_for_each_quota_goal_safe(goal, next, (&dst))
+ damos_destroy_quota_goal(goal);
+ if (skip)
+ kunit_skip(test, "goal alloc fail");
+}
+
+static void damos_test_commit_quota_goals(struct kunit *test)
+{
+ damos_test_commit_quota_goals_for(test,
+ (struct damos_quota_goal[]){}, 0,
+ (struct damos_quota_goal[]){
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 123,
+ },
+ }, 1);
+ damos_test_commit_quota_goals_for(test,
+ (struct damos_quota_goal[]){
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 234,
+ },
+
+ }, 1,
+ (struct damos_quota_goal[]){
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 345,
+ },
+ }, 1);
+ damos_test_commit_quota_goals_for(test,
+ (struct damos_quota_goal[]){
+ {
+ .metric = DAMOS_QUOTA_USER_INPUT,
+ .target_value = 456,
+ },
+
+ }, 1,
+ (struct damos_quota_goal[]){}, 0);
+}
+
+static void damos_test_commit_quota(struct kunit *test)
+{
+ struct damos_quota dst = {
+ .reset_interval = 1,
+ .ms = 2,
+ .sz = 3,
+ .weight_sz = 4,
+ .weight_nr_accesses = 5,
+ .weight_age = 6,
+ };
+ struct damos_quota src = {
+ .reset_interval = 7,
+ .ms = 8,
+ .sz = 9,
+ .weight_sz = 10,
+ .weight_nr_accesses = 11,
+ .weight_age = 12,
+ };
+
+ INIT_LIST_HEAD(&dst.goals);
+ INIT_LIST_HEAD(&src.goals);
+
+ damos_commit_quota(&dst, &src);
+
+ KUNIT_EXPECT_EQ(test, dst.reset_interval, src.reset_interval);
+ KUNIT_EXPECT_EQ(test, dst.ms, src.ms);
+ KUNIT_EXPECT_EQ(test, dst.sz, src.sz);
+ KUNIT_EXPECT_EQ(test, dst.weight_sz, src.weight_sz);
+ KUNIT_EXPECT_EQ(test, dst.weight_nr_accesses, src.weight_nr_accesses);
+ KUNIT_EXPECT_EQ(test, dst.weight_age, src.weight_age);
+}
+
+static int damos_test_help_dests_setup(struct damos_migrate_dests *dests,
+ unsigned int *node_id_arr, unsigned int *weight_arr,
+ size_t nr_dests)
+{
+ size_t i;
+
+ dests->node_id_arr = kmalloc_array(nr_dests,
+ sizeof(*dests->node_id_arr), GFP_KERNEL);
+ if (!dests->node_id_arr)
+ return -ENOMEM;
+ dests->weight_arr = kmalloc_array(nr_dests,
+ sizeof(*dests->weight_arr), GFP_KERNEL);
+ if (!dests->weight_arr) {
+ kfree(dests->node_id_arr);
+ dests->node_id_arr = NULL;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_dests; i++) {
+ dests->node_id_arr[i] = node_id_arr[i];
+ dests->weight_arr[i] = weight_arr[i];
+ }
+ dests->nr_dests = nr_dests;
+ return 0;
+}
+
+static void damos_test_help_dests_free(struct damos_migrate_dests *dests)
+{
+ kfree(dests->node_id_arr);
+ kfree(dests->weight_arr);
+}
+
+static void damos_test_commit_dests_for(struct kunit *test,
+ unsigned int *dst_node_id_arr, unsigned int *dst_weight_arr,
+ size_t dst_nr_dests,
+ unsigned int *src_node_id_arr, unsigned int *src_weight_arr,
+ size_t src_nr_dests)
+{
+ struct damos_migrate_dests dst = {}, src = {};
+ int i, err;
+ bool skip = true;
+
+ err = damos_test_help_dests_setup(&dst, dst_node_id_arr,
+ dst_weight_arr, dst_nr_dests);
+ if (err)
+ kunit_skip(test, "dests setup fail");
+ err = damos_test_help_dests_setup(&src, src_node_id_arr,
+ src_weight_arr, src_nr_dests);
+ if (err) {
+ damos_test_help_dests_free(&dst);
+ kunit_skip(test, "src setup fail");
+ }
+ err = damos_commit_dests(&dst, &src);
+ if (err)
+ goto out;
+ skip = false;
+
+ KUNIT_EXPECT_EQ(test, dst.nr_dests, src_nr_dests);
+ for (i = 0; i < dst.nr_dests; i++) {
+ KUNIT_EXPECT_EQ(test, dst.node_id_arr[i], src_node_id_arr[i]);
+ KUNIT_EXPECT_EQ(test, dst.weight_arr[i], src_weight_arr[i]);
+ }
+
+out:
+ damos_test_help_dests_free(&dst);
+ damos_test_help_dests_free(&src);
+ if (skip)
+ kunit_skip(test, "skip");
+}
+
+static void damos_test_commit_dests(struct kunit *test)
+{
+ damos_test_commit_dests_for(test,
+ (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4},
+ 3,
+ (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7},
+ 3);
+ damos_test_commit_dests_for(test,
+ (unsigned int[]){1, 2}, (unsigned int[]){2, 3},
+ 2,
+ (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7},
+ 3);
+ damos_test_commit_dests_for(test,
+ NULL, NULL, 0,
+ (unsigned int[]){4, 5, 6}, (unsigned int[]){5, 6, 7},
+ 3);
+ damos_test_commit_dests_for(test,
+ (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4},
+ 3,
+ (unsigned int[]){4, 5}, (unsigned int[]){5, 6}, 2);
+ damos_test_commit_dests_for(test,
+ (unsigned int[]){1, 2, 3}, (unsigned int[]){2, 3, 4},
+ 3,
+ NULL, NULL, 0);
+}
+
+static void damos_test_commit_filter_for(struct kunit *test,
+ struct damos_filter *dst, struct damos_filter *src)
+{
+ damos_commit_filter(dst, src);
+ KUNIT_EXPECT_EQ(test, dst->type, src->type);
+ KUNIT_EXPECT_EQ(test, dst->matching, src->matching);
+ KUNIT_EXPECT_EQ(test, dst->allow, src->allow);
+ switch (src->type) {
+ case DAMOS_FILTER_TYPE_MEMCG:
+ KUNIT_EXPECT_EQ(test, dst->memcg_id, src->memcg_id);
+ break;
+ case DAMOS_FILTER_TYPE_ADDR:
+ KUNIT_EXPECT_EQ(test, dst->addr_range.start,
+ src->addr_range.start);
+ KUNIT_EXPECT_EQ(test, dst->addr_range.end,
+ src->addr_range.end);
+ break;
+ case DAMOS_FILTER_TYPE_TARGET:
+ KUNIT_EXPECT_EQ(test, dst->target_idx, src->target_idx);
+ break;
+ case DAMOS_FILTER_TYPE_HUGEPAGE_SIZE:
+ KUNIT_EXPECT_EQ(test, dst->sz_range.min, src->sz_range.min);
+ KUNIT_EXPECT_EQ(test, dst->sz_range.max, src->sz_range.max);
+ break;
+ default:
+ break;
+ }
+}
+
static void damos_test_commit_filter(struct kunit *test)
{
- struct damos_filter *src_filter = damos_new_filter(
- DAMOS_FILTER_TYPE_ANON, true, true);
- struct damos_filter *dst_filter = damos_new_filter(
- DAMOS_FILTER_TYPE_ACTIVE, false, false);
+ struct damos_filter dst = {
+ .type = DAMOS_FILTER_TYPE_ACTIVE,
+ .matching = false,
+ .allow = false,
+ };
+
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_ANON,
+ .matching = true,
+ .allow = true,
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_MEMCG,
+ .matching = false,
+ .allow = false,
+ .memcg_id = 123,
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_YOUNG,
+ .matching = true,
+ .allow = true,
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_HUGEPAGE_SIZE,
+ .matching = false,
+ .allow = false,
+ .sz_range = {.min = 234, .max = 345},
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_UNMAPPED,
+ .matching = true,
+ .allow = true,
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_ADDR,
+ .matching = false,
+ .allow = false,
+ .addr_range = {.start = 456, .end = 567},
+ });
+ damos_test_commit_filter_for(test, &dst,
+ &(struct damos_filter){
+ .type = DAMOS_FILTER_TYPE_TARGET,
+ .matching = true,
+ .allow = true,
+ .target_idx = 6,
+ });
+}
+
+static void damos_test_help_initialize_scheme(struct damos *scheme)
+{
+ INIT_LIST_HEAD(&scheme->quota.goals);
+ INIT_LIST_HEAD(&scheme->core_filters);
+ INIT_LIST_HEAD(&scheme->ops_filters);
+}
- damos_commit_filter(dst_filter, src_filter);
- KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
- KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
- KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+static void damos_test_commit_for(struct kunit *test, struct damos *dst,
+ struct damos *src)
+{
+ int err;
+
+ damos_test_help_initialize_scheme(dst);
+ damos_test_help_initialize_scheme(src);
+
+ err = damos_commit(dst, src);
+ if (err)
+ kunit_skip(test, "damos_commit fail");
+
+ KUNIT_EXPECT_EQ(test, dst->pattern.min_sz_region,
+ src->pattern.min_sz_region);
+ KUNIT_EXPECT_EQ(test, dst->pattern.max_sz_region,
+ src->pattern.max_sz_region);
+ KUNIT_EXPECT_EQ(test, dst->pattern.min_nr_accesses,
+ src->pattern.min_nr_accesses);
+ KUNIT_EXPECT_EQ(test, dst->pattern.max_nr_accesses,
+ src->pattern.max_nr_accesses);
+ KUNIT_EXPECT_EQ(test, dst->pattern.min_age_region,
+ src->pattern.min_age_region);
+ KUNIT_EXPECT_EQ(test, dst->pattern.max_age_region,
+ src->pattern.max_age_region);
+
+ KUNIT_EXPECT_EQ(test, dst->action, src->action);
+ KUNIT_EXPECT_EQ(test, dst->apply_interval_us, src->apply_interval_us);
+
+ KUNIT_EXPECT_EQ(test, dst->wmarks.metric, src->wmarks.metric);
+ KUNIT_EXPECT_EQ(test, dst->wmarks.interval, src->wmarks.interval);
+ KUNIT_EXPECT_EQ(test, dst->wmarks.high, src->wmarks.high);
+ KUNIT_EXPECT_EQ(test, dst->wmarks.mid, src->wmarks.mid);
+ KUNIT_EXPECT_EQ(test, dst->wmarks.low, src->wmarks.low);
+
+ switch (src->action) {
+ case DAMOS_MIGRATE_COLD:
+ case DAMOS_MIGRATE_HOT:
+ KUNIT_EXPECT_EQ(test, dst->target_nid, src->target_nid);
+ break;
+ default:
+ break;
+ }
+}
- damos_destroy_filter(src_filter);
- damos_destroy_filter(dst_filter);
+static void damos_test_commit(struct kunit *test)
+{
+ damos_test_commit_for(test,
+ &(struct damos){
+ .pattern = (struct damos_access_pattern){
+ 1, 2, 3, 4, 5, 6},
+ .action = DAMOS_PAGEOUT,
+ .apply_interval_us = 1000000,
+ .wmarks = (struct damos_watermarks){
+ DAMOS_WMARK_FREE_MEM_RATE,
+ 900, 100, 50},
+ },
+ &(struct damos){
+ .pattern = (struct damos_access_pattern){
+ 2, 3, 4, 5, 6, 7},
+ .action = DAMOS_PAGEOUT,
+ .apply_interval_us = 2000000,
+ .wmarks = (struct damos_watermarks){
+ DAMOS_WMARK_FREE_MEM_RATE,
+ 800, 50, 30},
+ });
+ damos_test_commit_for(test,
+ &(struct damos){
+ .pattern = (struct damos_access_pattern){
+ 1, 2, 3, 4, 5, 6},
+ .action = DAMOS_PAGEOUT,
+ .apply_interval_us = 1000000,
+ .wmarks = (struct damos_watermarks){
+ DAMOS_WMARK_FREE_MEM_RATE,
+ 900, 100, 50},
+ },
+ &(struct damos){
+ .pattern = (struct damos_access_pattern){
+ 2, 3, 4, 5, 6, 7},
+ .action = DAMOS_MIGRATE_HOT,
+ .apply_interval_us = 2000000,
+ .target_nid = 5,
+ });
+}
+
+static struct damon_target *damon_test_help_setup_target(
+ unsigned long region_start_end[][2], int nr_regions)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ int i;
+
+ t = damon_new_target();
+ if (!t)
+ return NULL;
+ for (i = 0; i < nr_regions; i++) {
+ r = damon_new_region(region_start_end[i][0],
+ region_start_end[i][1]);
+ if (!r) {
+ damon_free_target(t);
+ return NULL;
+ }
+ damon_add_region(r, t);
+ }
+ return t;
+}
+
+static void damon_test_commit_target_regions_for(struct kunit *test,
+ unsigned long dst_start_end[][2], int nr_dst_regions,
+ unsigned long src_start_end[][2], int nr_src_regions,
+ unsigned long expect_start_end[][2], int nr_expect_regions)
+{
+ struct damon_target *dst_target, *src_target;
+ struct damon_region *r;
+ int i;
+
+ dst_target = damon_test_help_setup_target(dst_start_end, nr_dst_regions);
+ if (!dst_target)
+ kunit_skip(test, "dst target setup fail");
+ src_target = damon_test_help_setup_target(src_start_end, nr_src_regions);
+ if (!src_target) {
+ damon_free_target(dst_target);
+ kunit_skip(test, "src target setup fail");
+ }
+ damon_commit_target_regions(dst_target, src_target, 1);
+ i = 0;
+ damon_for_each_region(r, dst_target) {
+ KUNIT_EXPECT_EQ(test, r->ar.start, expect_start_end[i][0]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expect_start_end[i][1]);
+ i++;
+ }
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(dst_target), nr_expect_regions);
+ KUNIT_EXPECT_EQ(test, i, nr_expect_regions);
+ damon_free_target(dst_target);
+ damon_free_target(src_target);
+}
+
+static void damon_test_commit_target_regions(struct kunit *test)
+{
+ damon_test_commit_target_regions_for(test,
+ (unsigned long[][2]) {{3, 8}, {8, 10}}, 2,
+ (unsigned long[][2]) {{4, 6}}, 1,
+ (unsigned long[][2]) {{4, 6}}, 1);
+ damon_test_commit_target_regions_for(test,
+ (unsigned long[][2]) {{3, 8}, {8, 10}}, 2,
+ (unsigned long[][2]) {}, 0,
+ (unsigned long[][2]) {{3, 8}, {8, 10}}, 2);
}
static void damos_test_filter_out(struct kunit *test)
@@ -442,58 +1035,66 @@ static void damos_test_filter_out(struct kunit *test)
struct damos_filter *f;
f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, false);
- f->addr_range = (struct damon_addr_range){
- .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6};
+ if (!f)
+ kunit_skip(test, "filter alloc fail");
+ f->addr_range = (struct damon_addr_range){.start = 2, .end = 6};
t = damon_new_target();
- r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5);
+ if (!t) {
+ damos_destroy_filter(f);
+ kunit_skip(test, "target alloc fail");
+ }
+ r = damon_new_region(3, 5);
+ if (!r) {
+ damos_destroy_filter(f);
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
damon_add_region(r, t);
/* region in the range */
- KUNIT_EXPECT_TRUE(test,
- damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
+ KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f, 1));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region before the range */
- r->ar.start = DAMON_MIN_REGION * 1;
- r->ar.end = DAMON_MIN_REGION * 2;
+ r->ar.start = 1;
+ r->ar.end = 2;
KUNIT_EXPECT_FALSE(test,
- damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
+ damos_filter_match(NULL, t, r, f, 1));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region after the range */
- r->ar.start = DAMON_MIN_REGION * 6;
- r->ar.end = DAMON_MIN_REGION * 8;
+ r->ar.start = 6;
+ r->ar.end = 8;
KUNIT_EXPECT_FALSE(test,
- damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
+ damos_filter_match(NULL, t, r, f, 1));
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1);
/* region started before the range */
- r->ar.start = DAMON_MIN_REGION * 1;
- r->ar.end = DAMON_MIN_REGION * 4;
- KUNIT_EXPECT_FALSE(test,
- damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
+ r->ar.start = 1;
+ r->ar.end = 4;
+ KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f, 1));
/* filter should have split the region */
- KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1);
- KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 1);
+ KUNIT_EXPECT_EQ(test, r->ar.end, 2);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
r2 = damon_next_region(r);
- KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2);
- KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, 2);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, 4);
damon_destroy_region(r2, t);
/* region started in the range */
- r->ar.start = DAMON_MIN_REGION * 2;
- r->ar.end = DAMON_MIN_REGION * 8;
+ r->ar.start = 2;
+ r->ar.end = 8;
KUNIT_EXPECT_TRUE(test,
- damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION));
+ damos_filter_match(NULL, t, r, f, 1));
/* filter should have split the region */
- KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2);
- KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6);
+ KUNIT_EXPECT_EQ(test, r->ar.start, 2);
+ KUNIT_EXPECT_EQ(test, r->ar.end, 6);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2);
r2 = damon_next_region(r);
- KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6);
- KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8);
+ KUNIT_EXPECT_EQ(test, r2->ar.start, 6);
+ KUNIT_EXPECT_EQ(test, r2->ar.end, 8);
damon_destroy_region(r2, t);
damon_free_target(t);
@@ -536,7 +1137,7 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
struct damos scheme;
struct damos_filter *target_filter, *anon_filter;
- INIT_LIST_HEAD(&scheme.filters);
+ INIT_LIST_HEAD(&scheme.core_filters);
INIT_LIST_HEAD(&scheme.ops_filters);
damos_set_filters_default_reject(&scheme);
@@ -548,6 +1149,8 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true);
+ if (!target_filter)
+ kunit_skip(test, "filter alloc fail");
damos_add_filter(&scheme, target_filter);
damos_set_filters_default_reject(&scheme);
/*
@@ -573,6 +1176,10 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false);
anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
+ if (!anon_filter) {
+ damos_free_filter(target_filter);
+ kunit_skip(test, "anon_filter alloc fail");
+ }
damos_add_filter(&scheme, anon_filter);
damos_set_filters_default_reject(&scheme);
@@ -598,6 +1205,9 @@ static void damon_test_set_filters_default_reject(struct kunit *test)
*/
KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false);
KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true);
+
+ damos_free_filter(anon_filter);
+ damos_free_filter(target_filter);
}
static struct kunit_case damon_test_cases[] = {
@@ -615,7 +1225,13 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_set_attrs),
KUNIT_CASE(damon_test_moving_sum),
KUNIT_CASE(damos_test_new_filter),
+ KUNIT_CASE(damos_test_commit_quota_goal),
+ KUNIT_CASE(damos_test_commit_quota_goals),
+ KUNIT_CASE(damos_test_commit_quota),
+ KUNIT_CASE(damos_test_commit_dests),
KUNIT_CASE(damos_test_commit_filter),
+ KUNIT_CASE(damos_test_commit),
+ KUNIT_CASE(damon_test_commit_target_regions),
KUNIT_CASE(damos_test_filter_out),
KUNIT_CASE(damon_test_feed_loop_next_input),
KUNIT_CASE(damon_test_set_filters_default_reject),
diff --git a/mm/damon/tests/sysfs-kunit.h b/mm/damon/tests/sysfs-kunit.h
index 7b5c7b307da9..0c665ed255a3 100644
--- a/mm/damon/tests/sysfs-kunit.h
+++ b/mm/damon/tests/sysfs-kunit.h
@@ -45,16 +45,41 @@ static void damon_sysfs_test_add_targets(struct kunit *test)
struct damon_ctx *ctx;
sysfs_targets = damon_sysfs_targets_alloc();
+ if (!sysfs_targets)
+ kunit_skip(test, "sysfs_targets alloc fail");
sysfs_targets->nr = 1;
sysfs_targets->targets_arr = kmalloc_array(1,
sizeof(*sysfs_targets->targets_arr), GFP_KERNEL);
+ if (!sysfs_targets->targets_arr) {
+ kfree(sysfs_targets);
+ kunit_skip(test, "targets_arr alloc fail");
+ }
sysfs_target = damon_sysfs_target_alloc();
+ if (!sysfs_target) {
+ kfree(sysfs_targets->targets_arr);
+ kfree(sysfs_targets);
+ kunit_skip(test, "sysfs_target alloc fail");
+ }
sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100);
sysfs_target->regions = damon_sysfs_regions_alloc();
+ if (!sysfs_target->regions) {
+ kfree(sysfs_targets->targets_arr);
+ kfree(sysfs_targets);
+ kfree(sysfs_target);
+ kunit_skip(test, "sysfs_regions alloc fail");
+ }
+
sysfs_targets->targets_arr[0] = sysfs_target;
ctx = damon_new_ctx();
+ if (!ctx) {
+ kfree(sysfs_targets->targets_arr);
+ kfree(sysfs_targets);
+ kfree(sysfs_target->regions);
+ kfree(sysfs_target);
+ kunit_skip(test, "ctx alloc fail");
+ }
damon_sysfs_add_targets(ctx, sysfs_targets);
KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx));
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index fce38dd53cf8..30dc5459f1d2 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -136,8 +136,14 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
int i;
t = damon_new_target();
+ if (!t)
+ kunit_skip(test, "target alloc fail");
for (i = 0; i < nr_regions / 2; i++) {
r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
+ if (!r) {
+ damon_destroy_target(t, NULL);
+ kunit_skip(test, "region alloc fail");
+ }
damon_add_region(r, t);
}
@@ -250,7 +256,16 @@ static void damon_test_split_evenly_fail(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
struct damon_target *t = damon_new_target();
- struct damon_region *r = damon_new_region(start, end);
+ struct damon_region *r;
+
+ if (!t)
+ kunit_skip(test, "target alloc fail");
+
+ r = damon_new_region(start, end);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test,
@@ -269,10 +284,17 @@ static void damon_test_split_evenly_succ(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
struct damon_target *t = damon_new_target();
- struct damon_region *r = damon_new_region(start, end);
+ struct damon_region *r;
unsigned long expected_width = (end - start) / nr_pieces;
unsigned long i = 0;
+ if (!t)
+ kunit_skip(test, "target alloc fail");
+ r = damon_new_region(start, end);
+ if (!r) {
+ damon_free_target(t);
+ kunit_skip(test, "region alloc fail");
+ }
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test,
damon_va_evenly_split_region(t, r, nr_pieces), 0);
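As a concrete reading of the success case, a worked instance of the arithmetic the test asserts; the last-region remainder behavior is inferred from the expected_width computation above, so treat it as an assumption:

/*
 * Illustrative values: splitting [0, 100) into nr_pieces = 3 gives
 * expected_width = (100 - 0) / 3 = 33, so the regions would become
 * [0, 33), [33, 66), [66, 100), with the final region assumed to
 * absorb the remainder.
 */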
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 7e834467b2d8..2750c88e7225 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -307,24 +307,16 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t *pte;
- pmd_t pmde;
spinlock_t *ptl;
- if (pmd_trans_huge(pmdp_get(pmd))) {
- ptl = pmd_lock(walk->mm, pmd);
- pmde = pmdp_get(pmd);
+ ptl = pmd_trans_huge_lock(pmd, walk->vma);
+ if (ptl) {
+ pmd_t pmde = pmdp_get(pmd);
- if (!pmd_present(pmde)) {
- spin_unlock(ptl);
- return 0;
- }
-
- if (pmd_trans_huge(pmde)) {
+ if (pmd_present(pmde))
damon_pmdp_mkold(pmd, walk->vma, addr);
- spin_unlock(ptl);
- return 0;
- }
spin_unlock(ptl);
+ return 0;
}
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
@@ -446,22 +438,13 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(pmdp_get(pmd))) {
- pmd_t pmde;
-
- ptl = pmd_lock(walk->mm, pmd);
- pmde = pmdp_get(pmd);
+ ptl = pmd_trans_huge_lock(pmd, walk->vma);
+ if (ptl) {
+ pmd_t pmde = pmdp_get(pmd);
- if (!pmd_present(pmde)) {
- spin_unlock(ptl);
- return 0;
- }
-
- if (!pmd_trans_huge(pmde)) {
- spin_unlock(ptl);
- goto regular_page;
- }
- folio = damon_get_folio(pmd_pfn(pmde));
+ if (!pmd_present(pmde))
+ goto huge_out;
+ folio = vm_normal_folio_pmd(walk->vma, addr, pmde);
if (!folio)
goto huge_out;
if (pmd_young(pmde) || !folio_test_idle(folio) ||
@@ -469,13 +452,10 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
addr))
priv->young = true;
*priv->folio_sz = HPAGE_PMD_SIZE;
- folio_put(folio);
huge_out:
spin_unlock(ptl);
return 0;
}
-
-regular_page:
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
@@ -484,14 +464,13 @@ regular_page:
ptent = ptep_get(pte);
if (!pte_present(ptent))
goto out;
- folio = damon_get_folio(pte_pfn(ptent));
+ folio = vm_normal_folio(walk->vma, addr, ptent);
if (!folio)
goto out;
if (pte_young(ptent) || !folio_test_idle(folio) ||
mmu_notifier_test_young(walk->mm, addr))
priv->young = true;
*priv->folio_sz = folio_size(folio);
- folio_put(folio);
out:
pte_unmap_unlock(pte, ptl);
return 0;
@@ -718,7 +697,6 @@ isolate:
list_add(&folio->lru, &migration_lists[i]);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
@@ -728,63 +706,49 @@ static int damos_va_migrate_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damos_migrate_dests *dests = &s->migrate_dests;
struct folio *folio;
spinlock_t *ptl;
- pmd_t pmde;
-
- ptl = pmd_lock(walk->mm, pmd);
- pmde = pmdp_get(pmd);
-
- if (!pmd_present(pmde) || !pmd_trans_huge(pmde))
- goto unlock;
-
- /* Tell page walk code to not split the PMD */
- walk->action = ACTION_CONTINUE;
-
- folio = damon_get_folio(pmd_pfn(pmde));
- if (!folio)
- goto unlock;
-
- if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
- goto put_folio;
-
- damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
- migration_lists);
-
-put_folio:
- folio_put(folio);
-unlock:
- spin_unlock(ptl);
- return 0;
-}
-#else
-#define damos_va_migrate_pmd_entry NULL
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ pte_t *start_pte, *pte, ptent;
+ int nr;
-static int damos_va_migrate_pte_entry(pte_t *pte, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
-{
- struct damos_va_migrate_private *priv = walk->private;
- struct list_head *migration_lists = priv->migration_lists;
- struct damos *s = priv->scheme;
- struct damos_migrate_dests *dests = &s->migrate_dests;
- struct folio *folio;
- pte_t ptent;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ ptl = pmd_trans_huge_lock(pmd, walk->vma);
+ if (ptl) {
+ pmd_t pmde = pmdp_get(pmd);
- ptent = ptep_get(pte);
- if (pte_none(ptent) || !pte_present(ptent))
+ if (!pmd_present(pmde))
+ goto huge_out;
+ folio = vm_normal_folio_pmd(walk->vma, addr, pmde);
+ if (!folio)
+ goto huge_out;
+ if (damos_va_filter_out(s, folio, walk->vma, addr, NULL, pmd))
+ goto huge_out;
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+huge_out:
+ spin_unlock(ptl);
return 0;
+ }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- folio = damon_get_folio(pte_pfn(ptent));
- if (!folio)
+ start_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte)
return 0;
- if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
- goto put_folio;
-
- damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
- migration_lists);
+ for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
+ ptent = ptep_get(pte);
-put_folio:
- folio_put(folio);
+ if (pte_none(ptent) || !pte_present(ptent))
+ continue;
+ folio = vm_normal_folio(walk->vma, addr, ptent);
+ if (!folio)
+ continue;
+ if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
+ continue;
+ damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
+ migration_lists);
+ nr = folio_nr_pages(folio);
+ }
+ pte_unmap_unlock(start_pte, ptl);
return 0;
}
@@ -850,7 +814,7 @@ static unsigned long damos_va_migrate(struct damon_target *target,
struct damos_migrate_dests *dests = &s->migrate_dests;
struct mm_walk_ops walk_ops = {
.pmd_entry = damos_va_migrate_pmd_entry,
- .pte_entry = damos_va_migrate_pte_entry,
+ .pte_entry = NULL,
.walk_lock = PGWALK_RDLOCK,
};
@@ -910,13 +874,10 @@ static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr,
int nr;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge(*pmd)) {
- pmd_t pmde;
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ pmd_t pmde = pmdp_get(pmd);
- ptl = pmd_trans_huge_lock(pmd, vma);
- if (!ptl)
- return 0;
- pmde = pmdp_get(pmd);
if (!pmd_present(pmde))
goto huge_unlock;
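All three DAMON walkers in this file converge on the same locking idiom, letting pmd_trans_huge_lock() replace the open-coded pmd_lock()/pmd_trans_huge() sequence. A condensed sketch of the shape they now share (the function name and bodies are illustrative):

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
	spinlock_t *ptl;
	pte_t *pte;

	/* Takes the PMD lock only if *pmd is huge, else returns NULL. */
	ptl = pmd_trans_huge_lock(pmd, walk->vma);
	if (ptl) {
		pmd_t pmde = pmdp_get(pmd);

		if (pmd_present(pmde)) {
			/* operate on the huge mapping */
		}
		spin_unlock(ptl);
		return 0;
	}

	/* Not huge: fall through to the PTE table. */
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	/* operate on individual PTEs */
	pte_unmap_unlock(pte, ptl);
	return 0;
}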
diff --git a/mm/debug.c b/mm/debug.c
index 64ddb0c4b4be..77fa8fe1d641 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -67,7 +67,7 @@ static const char *page_type_name(unsigned int page_type)
return page_type_names[i];
}
-static void __dump_folio(struct folio *folio, struct page *page,
+static void __dump_folio(const struct folio *folio, const struct page *page,
unsigned long pfn, unsigned long idx)
{
struct address_space *mapping = folio_mapping(folio);
@@ -327,7 +327,7 @@ static int __init setup_vm_debug(char *str)
while (*str) {
switch (tolower(*str)) {
- case'p':
+ case 'p':
__page_init_poisoning = true;
break;
default:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 830107b6dd08..ae9b9310d96f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -25,14 +25,14 @@
#include <linux/random.h>
#include <linux/spinlock.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
+#include <linux/pgalloc.h>
#include <asm/cacheflush.h>
-#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
/*
@@ -74,6 +74,7 @@ struct pgtable_debug_args {
unsigned long fixed_pte_pfn;
swp_entry_t swp_entry;
+ swp_entry_t leaf_entry;
};
static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
@@ -102,6 +103,12 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma))));
WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
+
+ WARN_ON(!pte_dirty(pte_mkwrite_novma(pte_mkdirty(pte))));
+ WARN_ON(pte_dirty(pte_mkwrite_novma(pte_mkclean(pte))));
+ WARN_ON(!pte_write(pte_mkdirty(pte_mkwrite_novma(pte))));
+ WARN_ON(!pte_write(pte_mkwrite_novma(pte_wrprotect(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite_novma(pte))));
}
static void __init pte_advanced_tests(struct pgtable_debug_args *args)
@@ -195,6 +202,13 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma))));
WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
+
+ WARN_ON(!pmd_dirty(pmd_mkwrite_novma(pmd_mkdirty(pmd))));
+ WARN_ON(pmd_dirty(pmd_mkwrite_novma(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_write(pmd_mkdirty(pmd_mkwrite_novma(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite_novma(pmd_wrprotect(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite_novma(pmd))));
+
/*
* A huge page does not point to next level page table
* entry. Hence this must qualify as pmd_bad().
@@ -690,7 +704,7 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
{
pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
- if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ if (!pgtable_supports_soft_dirty())
return;
pr_debug("Validating PTE soft dirty\n");
@@ -701,14 +715,16 @@ static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args)
{
pte_t pte;
+ softleaf_t entry;
- if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ if (!pgtable_supports_soft_dirty())
return;
pr_debug("Validating PTE swap soft dirty\n");
pte = swp_entry_to_pte(args->swp_entry);
- WARN_ON(!is_swap_pte(pte));
+ entry = softleaf_from_pte(pte);
+ WARN_ON(!softleaf_is_swap(entry));
WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
}
@@ -718,7 +734,7 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ if (!pgtable_supports_soft_dirty())
return;
if (!has_transparent_hugepage())
@@ -730,65 +746,73 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args)
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
}
-static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args)
+static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
- !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+ if (!pgtable_supports_soft_dirty() ||
+ !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
return;
if (!has_transparent_hugepage())
return;
pr_debug("Validating PMD swap soft dirty\n");
- pmd = swp_entry_to_pmd(args->swp_entry);
- WARN_ON(!is_swap_pmd(pmd));
+ pmd = swp_entry_to_pmd(args->leaf_entry);
+ WARN_ON(!pmd_is_huge(pmd));
+ WARN_ON(!pmd_is_valid_softleaf(pmd));
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_leaf_soft_dirty_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
{
- swp_entry_t entry, entry2;
+ swp_entry_t entry;
+ softleaf_t softleaf;
pte_t pte;
pr_debug("Validating PTE swap exclusive\n");
entry = args->swp_entry;
pte = swp_entry_to_pte(entry);
+ softleaf = softleaf_from_pte(pte);
+
WARN_ON(pte_swp_exclusive(pte));
- WARN_ON(!is_swap_pte(pte));
- entry2 = pte_to_swp_entry(pte);
- WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+ WARN_ON(!softleaf_is_swap(softleaf));
+ WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
pte = pte_swp_mkexclusive(pte);
+ softleaf = softleaf_from_pte(pte);
+
WARN_ON(!pte_swp_exclusive(pte));
- WARN_ON(!is_swap_pte(pte));
+ WARN_ON(!softleaf_is_swap(softleaf));
WARN_ON(pte_swp_soft_dirty(pte));
- entry2 = pte_to_swp_entry(pte);
- WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+ WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
pte = pte_swp_clear_exclusive(pte);
+ softleaf = softleaf_from_pte(pte);
+
WARN_ON(pte_swp_exclusive(pte));
- WARN_ON(!is_swap_pte(pte));
- entry2 = pte_to_swp_entry(pte);
- WARN_ON(memcmp(&entry, &entry2, sizeof(entry)));
+ WARN_ON(!softleaf_is_swap(softleaf));
+ WARN_ON(memcmp(&entry, &softleaf, sizeof(entry)));
}
static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t arch_entry;
+ softleaf_t entry;
pte_t pte1, pte2;
pr_debug("Validating PTE swap\n");
pte1 = swp_entry_to_pte(args->swp_entry);
- WARN_ON(!is_swap_pte(pte1));
+ entry = softleaf_from_pte(pte1);
+
+ WARN_ON(!softleaf_is_swap(entry));
arch_entry = __pte_to_swp_entry(pte1);
pte2 = __swp_entry_to_pte(arch_entry);
@@ -796,7 +820,7 @@ static void __init pte_swap_tests(struct pgtable_debug_args *args)
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-static void __init pmd_swap_tests(struct pgtable_debug_args *args)
+static void __init pmd_softleaf_tests(struct pgtable_debug_args *args)
{
swp_entry_t arch_entry;
pmd_t pmd1, pmd2;
@@ -805,21 +829,22 @@ static void __init pmd_swap_tests(struct pgtable_debug_args *args)
return;
pr_debug("Validating PMD swap\n");
- pmd1 = swp_entry_to_pmd(args->swp_entry);
- WARN_ON(!is_swap_pmd(pmd1));
+ pmd1 = swp_entry_to_pmd(args->leaf_entry);
+ WARN_ON(!pmd_is_huge(pmd1));
+ WARN_ON(!pmd_is_valid_softleaf(pmd1));
arch_entry = __pmd_to_swp_entry(pmd1);
pmd2 = __swp_entry_to_pmd(arch_entry);
WARN_ON(memcmp(&pmd1, &pmd2, sizeof(pmd1)));
}
#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
-static void __init pmd_swap_tests(struct pgtable_debug_args *args) { }
+static void __init pmd_softleaf_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
static void __init swap_migration_tests(struct pgtable_debug_args *args)
{
struct page *page;
- swp_entry_t swp;
+ softleaf_t entry;
if (!IS_ENABLED(CONFIG_MIGRATION))
return;
@@ -842,17 +867,17 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
* be locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
- swp = make_writable_migration_entry(page_to_pfn(page));
- WARN_ON(!is_migration_entry(swp));
- WARN_ON(!is_writable_migration_entry(swp));
+ entry = make_writable_migration_entry(page_to_pfn(page));
+ WARN_ON(!softleaf_is_migration(entry));
+ WARN_ON(!softleaf_is_migration_write(entry));
- swp = make_readable_migration_entry(swp_offset(swp));
- WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_writable_migration_entry(swp));
+ entry = make_readable_migration_entry(swp_offset(entry));
+ WARN_ON(!softleaf_is_migration(entry));
+ WARN_ON(softleaf_is_migration_write(entry));
- swp = make_readable_migration_entry(page_to_pfn(page));
- WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_writable_migration_entry(swp));
+ entry = make_readable_migration_entry(page_to_pfn(page));
+ WARN_ON(!softleaf_is_migration(entry));
+ WARN_ON(softleaf_is_migration_write(entry));
__ClearPageLocked(page);
}
@@ -1204,9 +1229,11 @@ static int __init init_args(struct pgtable_debug_args *args)
init_fixed_pfns(args);
/* See generic_max_swapfile_size(): probe the maximum offset */
- max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL))));
- /* Create a swp entry with all possible bits set */
- args->swp_entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset);
+ max_swap_offset = swp_offset(softleaf_from_pte(softleaf_to_pte(swp_entry(0, ~0UL))));
+ /* Create a swp entry with all possible bits set while still being a swap entry. */
+ args->swp_entry = swp_entry(MAX_SWAPFILES - 1, max_swap_offset);
+ /* Create a non-present migration entry. */
+ args->leaf_entry = make_writable_migration_entry(~0UL);
/*
* Allocate (huge) pages because some of the tests need to access
@@ -1296,12 +1323,12 @@ static int __init debug_vm_pgtable(void)
pte_soft_dirty_tests(&args);
pmd_soft_dirty_tests(&args);
pte_swap_soft_dirty_tests(&args);
- pmd_swap_soft_dirty_tests(&args);
+ pmd_leaf_soft_dirty_tests(&args);
pte_swap_exclusive_tests(&args);
pte_swap_tests(&args);
- pmd_swap_tests(&args);
+ pmd_softleaf_tests(&args);
swap_migration_tests(&args);
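The conversions above share a decode-then-classify shape: the non-present entry is turned into a softleaf_t once, and every later test goes through softleaf_is_*() rather than is_swap_pte() and friends. A sketch built only from helpers that appear in this diff (the function itself is illustrative):

static void example_classify(pte_t pte)
{
	const softleaf_t entry = softleaf_from_pte(pte);

	if (softleaf_is_swap(entry)) {
		/* genuine swap entry */
	} else if (softleaf_is_migration(entry)) {
		/* migration entry; writability via softleaf_is_migration_write() */
	} else if (softleaf_is_device_private(entry)) {
		/* device-private entry; folio via softleaf_to_folio() */
	}
}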
diff --git a/mm/filemap.c b/mm/filemap.c
index dfc8a31f1222..1bc81e561aa0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,7 +21,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -48,7 +48,8 @@
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
+
#include <asm/tlbflush.h>
#include "internal.h"
@@ -181,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping,
nr = folio_nr_pages(folio);
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
if (folio_test_swapbacked(folio)) {
- __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+ lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
} else if (folio_test_pmd_mappable(folio)) {
- __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+ lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
@@ -830,13 +831,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!folio_test_hugetlb(old))
- __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+ lruvec_stat_sub_folio(old, NR_FILE_PAGES);
if (!folio_test_hugetlb(new))
- __lruvec_stat_add_folio(new, NR_FILE_PAGES);
+ lruvec_stat_add_folio(new, NR_FILE_PAGES);
if (folio_test_swapbacked(old))
- __lruvec_stat_sub_folio(old, NR_SHMEM);
+ lruvec_stat_sub_folio(old, NR_SHMEM);
if (folio_test_swapbacked(new))
- __lruvec_stat_add_folio(new, NR_SHMEM);
+ lruvec_stat_add_folio(new, NR_SHMEM);
xas_unlock_irq(&xas);
if (free_folio)
free_folio(old);
@@ -919,9 +920,9 @@ noinline int __filemap_add_folio(struct address_space *mapping,
/* hugetlb pages do not participate in page cache accounting */
if (!huge) {
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio,
+ lruvec_stat_mod_folio(folio,
NR_FILE_THPS, nr);
}
@@ -1388,7 +1389,7 @@ repeat:
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
-void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl)
{
struct wait_page_queue wait_page;
@@ -1397,7 +1398,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = pfn_swap_entry_folio(entry);
+ struct folio *folio = softleaf_to_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -3298,11 +3299,47 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
struct file *fpin = NULL;
vm_flags_t vm_flags = vmf->vma->vm_flags;
+ bool force_thp_readahead = false;
unsigned short mmap_miss;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
- if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
+ force_thp_readahead = true;
+
+ if (!force_thp_readahead) {
+ /*
+ * If we don't want any read-ahead, don't bother.
+ * VM_EXEC case below is already intended for random access.
+ */
+ if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
+ return fpin;
+
+ if (!ra->ra_pages)
+ return fpin;
+
+ if (vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ page_cache_sync_ra(&ractl, ra->ra_pages);
+ return fpin;
+ }
+ }
+
+ if (!(vm_flags & VM_SEQ_READ)) {
+ /* Avoid banging the cache line if not needed */
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss < MMAP_LOTSAMISS * 10)
+ WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
+
+ /*
+ * Do we miss much more than hit in this file? If so,
+ * stop bothering with read-ahead. It will only hurt.
+ */
+ if (mmap_miss > MMAP_LOTSAMISS)
+ return fpin;
+ }
+
+ if (force_thp_readahead) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
ra->size = HPAGE_PMD_NR;
@@ -3317,34 +3354,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
page_cache_ra_order(&ractl, ra);
return fpin;
}
-#endif
-
- /*
- * If we don't want any read-ahead, don't bother. VM_EXEC case below is
- * already intended for random access.
- */
- if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
- return fpin;
- if (!ra->ra_pages)
- return fpin;
-
- if (vm_flags & VM_SEQ_READ) {
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_ra(&ractl, ra->ra_pages);
- return fpin;
- }
-
- /* Avoid banging the cache line if not needed */
- mmap_miss = READ_ONCE(ra->mmap_miss);
- if (mmap_miss < MMAP_LOTSAMISS * 10)
- WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
-
- /*
- * Do we miss much more than hit in this file? If so,
- * stop bothering with read-ahead. It will only hurt.
- */
- if (mmap_miss > MMAP_LOTSAMISS)
- return fpin;
if (vm_flags & VM_EXEC) {
/*
@@ -4595,7 +4604,7 @@ static void filemap_cachestat(struct address_space *mapping,
swp_entry_t swp = radix_to_swp_entry(folio);
/* swapin error results in poisoned entry */
- if (non_swap_entry(swp))
+ if (!softleaf_is_swap(swp))
goto resched;
/*
diff --git a/mm/gup.c b/mm/gup.c
index a8ba5112e4d0..95d948c8e86c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -950,7 +950,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
pudp = pud_offset(p4dp, address);
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (!pud_present(pud))
return no_page_table(vma, flags, address);
if (pud_leaf(pud)) {
@@ -975,7 +975,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
p4d_t *p4dp, p4d;
p4dp = p4d_offset(pgdp, address);
- p4d = READ_ONCE(*p4dp);
+ p4d = p4dp_get(p4dp);
BUILD_BUG_ON(p4d_leaf(p4d));
if (!p4d_present(p4d) || p4d_bad(p4d))
@@ -2710,7 +2710,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*
* *) ptes can be read atomically by the architecture.
*
- * *) valid user addesses are below TASK_MAX_SIZE
+ * *) valid user addresses are below TASK_MAX_SIZE
*
* The last two assumptions can be relaxed by the addition of helper functions.
*
@@ -3060,7 +3060,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
- pud_t pud = READ_ONCE(*pudp);
+ pud_t pud = pudp_get(pudp);
next = pud_addr_end(addr, end);
if (unlikely(!pud_present(pud)))
@@ -3086,7 +3086,7 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
- p4d_t p4d = READ_ONCE(*p4dp);
+ p4d_t p4d = p4dp_get(p4dp);
next = p4d_addr_end(addr, end);
if (!p4d_present(p4d))
@@ -3108,7 +3108,7 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
pgdp = pgd_offset(current->mm, addr);
do {
- pgd_t pgd = READ_ONCE(*pgdp);
+ pgd_t pgd = pgdp_get(pgdp);
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
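The gup hunks standardize on the typed page-table accessors in place of raw READ_ONCE() loads of table entries. Sketched below with illustrative control flow; only the accessor names are taken from the diff:

static void example_walk_top_levels(unsigned long addr)
{
	pgd_t *pgdp = pgd_offset(current->mm, addr);
	pgd_t pgd = pgdp_get(pgdp);	/* was: READ_ONCE(*pgdp) */

	if (pgd_none(pgd))
		return;
	/* p4dp_get() and pudp_get() continue the same pattern downward. */
}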
diff --git a/mm/hmm.c b/mm/hmm.c
index 87562914670a..4ec74c18bef6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -18,7 +18,7 @@
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
@@ -244,7 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t pfn_req_flags = *hmm_pfn;
uint64_t new_pfn_flags = 0;
- if (pte_none_mostly(pte)) {
+ /*
+ * Any marker other than a UFFD WP marker results in a fault error
+ * that is handled correctly, so we need only check for UFFD WP
+ * here.
+ */
+ if (pte_none(pte) || pte_is_uffd_wp_marker(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
@@ -253,19 +258,19 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
}
if (!pte_present(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
/*
* Don't fault in device private pages owned by the caller,
* just report the PFN.
*/
- if (is_device_private_entry(entry) &&
- page_pgmap(pfn_swap_entry_to_page(entry))->owner ==
+ if (softleaf_is_device_private(entry) &&
+ page_pgmap(softleaf_to_page(entry))->owner ==
range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
- if (is_writable_device_private_entry(entry))
+ if (softleaf_is_device_private_write(entry))
cpu_flags |= HMM_PFN_WRITE;
- new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ new_pfn_flags = softleaf_to_pfn(entry) | cpu_flags;
goto out;
}
@@ -274,16 +279,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!required_fault)
goto out;
- if (!non_swap_entry(entry))
+ if (softleaf_is_swap(entry))
goto fault;
- if (is_device_private_entry(entry))
+ if (softleaf_is_device_private(entry))
goto fault;
- if (is_device_exclusive_entry(entry))
+ if (softleaf_is_device_exclusive(entry))
goto fault;
- if (is_migration_entry(entry)) {
+ if (softleaf_is_migration(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
migration_entry_wait(walk->mm, pmdp, addr);
@@ -334,19 +339,19 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long npages = (end - start) >> PAGE_SHIFT;
+ const softleaf_t entry = softleaf_from_pmd(pmd);
unsigned long addr = start;
- swp_entry_t entry = pmd_to_swp_entry(pmd);
unsigned int required_fault;
- if (is_device_private_entry(entry) &&
- pfn_swap_entry_folio(entry)->pgmap->owner ==
+ if (softleaf_is_device_private(entry) &&
+ softleaf_to_folio(entry)->pgmap->owner ==
range->dev_private_owner) {
unsigned long cpu_flags = HMM_PFN_VALID |
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
- unsigned long pfn = swp_offset_pfn(entry);
+ unsigned long pfn = softleaf_to_pfn(entry);
unsigned long i;
- if (is_writable_device_private_entry(entry))
+ if (softleaf_is_device_private_write(entry))
cpu_flags |= HMM_PFN_WRITE;
/*
@@ -365,7 +370,7 @@ static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start,
required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, 0);
if (required_fault) {
- if (is_device_private_entry(entry))
+ if (softleaf_is_device_private(entry))
return hmm_vma_fault(addr, end, required_fault, walk);
else
return -EFAULT;
@@ -407,7 +412,7 @@ again:
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, -1, walk);
- if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+ if (thp_migration_supported() && pmd_is_migration_entry(pmd)) {
if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(walk->mm, pmdp);
@@ -491,7 +496,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
/* Normally we don't want to split the huge page */
walk->action = ACTION_CONTINUE;
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (!pud_present(pud)) {
spin_unlock(ptl);
return hmm_vma_walk_hole(start, end, -1, walk);
@@ -811,7 +816,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
break;
case PCI_P2PDMA_MAP_BUS_ADDR:
pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
- return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr);
default:
return DMA_MAPPING_ERROR;
}
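The hmm_vma_handle_pte() change replaces pte_none_mostly() with an explicit pair of checks. A minimal sketch of the resulting predicate, assuming only the helpers used in that hunk:

static bool example_hole_like(pte_t pte)
{
	/*
	 * Empty PTEs and UFFD write-protect markers are treated as
	 * holes; any other marker raises a fault error that the normal
	 * fault path resolves.
	 */
	return pte_none(pte) || pte_is_uffd_wp_marker(pte);
}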
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1192e62531cd..f7c565f11a98 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,11 +37,11 @@
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
+#include <linux/pgalloc.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"
@@ -1077,28 +1077,103 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
return pmd;
}
+static struct deferred_split *split_queue_node(int nid)
+{
+ struct pglist_data *pgdata = NODE_DATA(nid);
+
+ return &pgdata->deferred_split_queue;
+}
+
#ifdef CONFIG_MEMCG
static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct mem_cgroup *memcg = folio_memcg(folio);
- struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+ if (mem_cgroup_disabled())
+ return NULL;
+ if (split_queue_node(folio_nid(folio)) == queue)
+ return NULL;
+ return container_of(queue, struct mem_cgroup, deferred_split_queue);
+}
- if (memcg)
- return &memcg->deferred_split_queue;
- else
- return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+ return memcg ? &memcg->deferred_split_queue : split_queue_node(nid);
}
#else
static inline
-struct deferred_split *get_deferred_split_queue(struct folio *folio)
+struct mem_cgroup *folio_split_queue_memcg(struct folio *folio,
+ struct deferred_split *queue)
{
- struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));
+ return NULL;
+}
- return &pgdat->deferred_split_queue;
+static struct deferred_split *memcg_split_queue(int nid, struct mem_cgroup *memcg)
+{
+ return split_queue_node(nid);
}
#endif
+static struct deferred_split *split_queue_lock(int nid, struct mem_cgroup *memcg)
+{
+ struct deferred_split *queue;
+
+retry:
+ queue = memcg_split_queue(nid, memcg);
+ spin_lock(&queue->split_queue_lock);
+ /*
+ * There is a window between a memcg being marked dying and its
+ * deferred split queue being reparented; during that window the THPs
+ * on the queue are hidden from the shrinker.
+ */
+ if (unlikely(memcg_is_dying(memcg))) {
+ spin_unlock(&queue->split_queue_lock);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
+
+ return queue;
+}
+
+static struct deferred_split *
+split_queue_lock_irqsave(int nid, struct mem_cgroup *memcg, unsigned long *flags)
+{
+ struct deferred_split *queue;
+
+retry:
+ queue = memcg_split_queue(nid, memcg);
+ spin_lock_irqsave(&queue->split_queue_lock, *flags);
+ if (unlikely(memcg_is_dying(memcg))) {
+ spin_unlock_irqrestore(&queue->split_queue_lock, *flags);
+ memcg = parent_mem_cgroup(memcg);
+ goto retry;
+ }
+
+ return queue;
+}
+
+static struct deferred_split *folio_split_queue_lock(struct folio *folio)
+{
+ return split_queue_lock(folio_nid(folio), folio_memcg(folio));
+}
+
+static struct deferred_split *
+folio_split_queue_lock_irqsave(struct folio *folio, unsigned long *flags)
+{
+ return split_queue_lock_irqsave(folio_nid(folio), folio_memcg(folio), flags);
+}
+
+static inline void split_queue_unlock(struct deferred_split *queue)
+{
+ spin_unlock(&queue->split_queue_lock);
+}
+
+static inline void split_queue_unlock_irqrestore(struct deferred_split *queue,
+ unsigned long flags)
+{
+ spin_unlock_irqrestore(&queue->split_queue_lock, flags);
+}
+
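A hedged usage sketch for the helpers above: callers no longer pick a queue by hand, they lock through folio_split_queue_lock(), which retries against the parent memcg when the folio's memcg is dying, and release with split_queue_unlock(). The list manipulation is illustrative:

static void example_queue_folio(struct folio *folio)
{
	struct deferred_split *queue;

	queue = folio_split_queue_lock(folio);
	/* ... enqueue or dequeue folio on the deferred split list ... */
	split_queue_unlock(queue);
}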
static inline bool is_transparent_hugepage(const struct folio *folio)
{
if (!folio_test_large(folio))
@@ -1127,7 +1202,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (len_pad < len || (off + len_pad) < off)
return 0;
- ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
+ ret = mm_get_unmapped_area_vmflags(filp, addr, len_pad,
off >> PAGE_SHIFT, flags, vm_flags);
/*
@@ -1164,7 +1239,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
if (ret)
return ret;
- return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
+ return mm_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags,
vm_flags);
}
@@ -1218,7 +1293,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
return folio;
}
-static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
struct vm_area_struct *vma, unsigned long haddr)
{
pmd_t entry;
@@ -1229,6 +1304,13 @@ static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
folio_add_lru_vma(folio, vma);
set_pmd_at(vma->vm_mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, haddr, pmd);
+ deferred_split_folio(folio, false);
+}
+
+static void map_anon_folio_pmd_pf(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ map_anon_folio_pmd_nopf(folio, pmd, vma, haddr);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
count_vm_event(THP_FAULT_ALLOC);
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -1271,9 +1353,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return ret;
}
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
- deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
}
@@ -1288,6 +1369,44 @@ release:
}
+vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = 0;
+ spinlock_t *ptl;
+ softleaf_t entry;
+ struct page *page;
+ struct folio *folio;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+
+ ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ entry = softleaf_from_pmd(vmf->orig_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+ vmf->page = page;
+ vmf->pte = NULL;
+ if (folio_trylock(folio)) {
+ folio_get(folio);
+ spin_unlock(ptl);
+ ret = page_pgmap(page)->ops->migrate_to_ram(vmf);
+ folio_unlock(folio);
+ folio_put(folio);
+ } else {
+ spin_unlock(ptl);
+ }
+
+ return ret;
+}
+
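The call site is not part of this diff; presumably the generic huge-PMD fault path dispatches here once it decodes a device-private entry. A hypothetical sketch of that dispatch, where only do_huge_pmd_device_private() and pmd_is_device_private_entry() come from the patch:

static vm_fault_t example_huge_pmd_fault(struct vm_fault *vmf)
{
	/* Hypothetical dispatch; the real caller is not shown here. */
	if (unlikely(pmd_is_device_private_entry(vmf->orig_pmd)))
		return do_huge_pmd_device_private(vmf);
	return VM_FAULT_FALLBACK;
}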
/*
* always: directly stall for all thp allocations
* defer: wake kswapd and fail if not immediately available
@@ -1668,6 +1787,62 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
return false;
}
+static void copy_huge_non_present_pmd(
+ struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t pmd, pgtable_t pgtable)
+{
+ softleaf_t entry = softleaf_from_pmd(pmd);
+ struct folio *src_folio;
+
+ VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(pmd));
+
+ if (softleaf_is_migration_write(entry) ||
+ softleaf_is_migration_read_exclusive(entry)) {
+ entry = make_readable_migration_entry(swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ } else if (softleaf_is_device_private(entry)) {
+ /*
+ * Device private entries have no read-exclusive variant,
+ * so writable = !readable.
+ */
+ if (softleaf_is_device_private_write(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ pmd = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*src_pmd))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
+ set_pmd_at(src_mm, addr, src_pmd, pmd);
+ }
+
+ src_folio = softleaf_to_folio(entry);
+ VM_WARN_ON(!folio_test_large(src_folio));
+
+ folio_get(src_folio);
+ /*
+ * folio_try_dup_anon_rmap_pmd does not fail for
+ * device private entries.
+ */
+ folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
+ dst_vma, src_vma);
+ }
+
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ mm_inc_nr_ptes(dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
@@ -1713,31 +1888,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- if (unlikely(is_swap_pmd(pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(pmd);
-
- VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (!is_readable_migration_entry(entry)) {
- entry = make_readable_migration_entry(
- swp_offset(entry));
- pmd = swp_entry_to_pmd(entry);
- if (pmd_swp_soft_dirty(*src_pmd))
- pmd = pmd_swp_mksoft_dirty(pmd);
- if (pmd_swp_uffd_wp(*src_pmd))
- pmd = pmd_swp_mkuffd_wp(pmd);
- set_pmd_at(src_mm, addr, src_pmd, pmd);
- }
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- mm_inc_nr_ptes(dst_mm);
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
- if (!userfaultfd_wp(dst_vma))
- pmd = pmd_swp_clear_uffd_wp(pmd);
- set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ if (unlikely(thp_migration_supported() &&
+ pmd_is_valid_softleaf(pmd))) {
+ copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr,
+ dst_vma, src_vma, pmd, pgtable);
ret = 0;
goto out_unlock;
}
-#endif
if (unlikely(!pmd_trans_huge(pmd))) {
pte_free(dst_mm, pgtable);
@@ -1887,7 +2044,7 @@ static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
if (ret)
goto release;
(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
- map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ map_anon_folio_pmd_pf(folio, vmf->pmd, vma, haddr);
goto unlock;
release:
folio_put(folio);
@@ -2123,7 +2280,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto out;
}
@@ -2221,15 +2378,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_remove_rmap_pmd(folio, page, vma);
WARN_ON_ONCE(folio_mapcount(folio) < 0);
VM_BUG_ON_PAGE(!PageHead(page), page);
- } else if (thp_migration_supported()) {
- swp_entry_t entry;
+ } else if (pmd_is_valid_softleaf(orig_pmd)) {
+ const softleaf_t entry = softleaf_from_pmd(orig_pmd);
- VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
- entry = pmd_to_swp_entry(orig_pmd);
- folio = pfn_swap_entry_folio(entry);
+ folio = softleaf_to_folio(entry);
flush_needed = 0;
- } else
- WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+ if (!thp_migration_supported())
+ WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+ }
if (folio_test_anon(folio)) {
zap_deposited_table(tlb->mm, pmd);
@@ -2249,6 +2406,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
folio_mark_accessed(folio);
}
+ if (folio_is_device_private(folio)) {
+ folio_remove_rmap_pmd(folio, &folio->page, vma);
+ WARN_ON_ONCE(folio_mapcount(folio) < 0);
+ folio_put(folio);
+ }
+
spin_unlock(ptl);
if (flush_needed)
tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2273,20 +2436,23 @@ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
- if (unlikely(is_pmd_migration_entry(pmd)))
- pmd = pmd_swp_mksoft_dirty(pmd);
- else if (pmd_present(pmd))
- pmd = pmd_mksoft_dirty(pmd);
-#endif
+ if (pgtable_supports_soft_dirty()) {
+ if (unlikely(pmd_is_migration_entry(pmd)))
+ pmd = pmd_swp_mksoft_dirty(pmd);
+ else if (pmd_present(pmd))
+ pmd = pmd_mksoft_dirty(pmd);
+ }
+
return pmd;
}
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
{
+ if (pmd_none(pmd))
+ return pmd;
if (pmd_present(pmd))
pmd = pmd_clear_uffd_wp(pmd);
- else if (is_swap_pmd(pmd))
+ else
pmd = pmd_swp_clear_uffd_wp(pmd);
return pmd;
@@ -2343,6 +2509,42 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
return false;
}
+static void change_non_present_huge_pmd(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmd, bool uffd_wp,
+ bool uffd_wp_resolve)
+{
+ softleaf_t entry = softleaf_from_pmd(*pmd);
+ const struct folio *folio = softleaf_to_folio(entry);
+ pmd_t newpmd;
+
+ VM_WARN_ON(!pmd_is_valid_softleaf(*pmd));
+ if (softleaf_is_migration_write(entry)) {
+ /*
+ * A protection check is difficult so
+ * just be safe and disable write
+ */
+ if (folio_test_anon(folio))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
+ if (pmd_swp_soft_dirty(*pmd))
+ newpmd = pmd_swp_mksoft_dirty(newpmd);
+ } else if (softleaf_is_device_private_write(entry)) {
+ entry = make_readable_device_private_entry(swp_offset(entry));
+ newpmd = swp_entry_to_pmd(entry);
+ } else {
+ newpmd = *pmd;
+ }
+
+ if (uffd_wp)
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
+ else if (uffd_wp_resolve)
+ newpmd = pmd_swp_clear_uffd_wp(newpmd);
+ if (!pmd_same(*pmd, newpmd))
+ set_pmd_at(mm, addr, pmd, newpmd);
+}
+
/*
* Returns
* - 0 if PMD could not be locked
@@ -2371,42 +2573,14 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (!ptl)
return 0;
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- if (is_swap_pmd(*pmd)) {
- swp_entry_t entry = pmd_to_swp_entry(*pmd);
- struct folio *folio = pfn_swap_entry_folio(entry);
- pmd_t newpmd;
-
- VM_BUG_ON(!is_pmd_migration_entry(*pmd));
- if (is_writable_migration_entry(entry)) {
- /*
- * A protection check is difficult so
- * just be safe and disable write
- */
- if (folio_test_anon(folio))
- entry = make_readable_exclusive_migration_entry(swp_offset(entry));
- else
- entry = make_readable_migration_entry(swp_offset(entry));
- newpmd = swp_entry_to_pmd(entry);
- if (pmd_swp_soft_dirty(*pmd))
- newpmd = pmd_swp_mksoft_dirty(newpmd);
- } else {
- newpmd = *pmd;
- }
-
- if (uffd_wp)
- newpmd = pmd_swp_mkuffd_wp(newpmd);
- else if (uffd_wp_resolve)
- newpmd = pmd_swp_clear_uffd_wp(newpmd);
- if (!pmd_same(*pmd, newpmd))
- set_pmd_at(mm, addr, pmd, newpmd);
+ if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
+ change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
+ uffd_wp_resolve);
goto unlock;
}
-#endif
if (prot_numa) {
- struct folio *folio;
- bool toptier;
+
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -2418,19 +2592,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_protnone(*pmd))
goto unlock;
- folio = pmd_folio(*pmd);
- toptier = node_is_toptier(folio_nid(folio));
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- toptier)
+ if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
+ vma_is_single_threaded_private(vma)))
goto unlock;
-
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio,
- jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2543,7 +2707,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
pmd_t _dst_pmd, src_pmdval;
struct page *src_page;
struct folio *src_folio;
- struct anon_vma *src_anon_vma;
spinlock_t *src_ptl, *dst_ptl;
pgtable_t src_pgtable;
struct mmu_notifier_range range;
@@ -2565,7 +2728,7 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
if (!pmd_trans_huge(src_pmdval)) {
spin_unlock(src_ptl);
- if (is_pmd_migration_entry(src_pmdval)) {
+ if (pmd_is_migration_entry(src_pmdval)) {
pmd_migration_entry_wait(mm, &src_pmdval);
return -EAGAIN;
}
@@ -2592,23 +2755,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
src_addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
- if (src_folio) {
+ if (src_folio)
folio_lock(src_folio);
- /*
- * split_huge_page walks the anon_vma chain without the page
- * lock. Serialize against it with the anon_vma lock, the page
- * lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- err = -EAGAIN;
- goto unlock_folio;
- }
- anon_vma_lock_write(src_anon_vma);
- } else
- src_anon_vma = NULL;
-
dst_ptl = pmd_lockptr(mm, dst_pmd);
double_pt_lock(src_ptl, dst_ptl);
if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
@@ -2653,11 +2802,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
double_pt_unlock(src_ptl, dst_ptl);
- if (src_anon_vma) {
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
- }
-unlock_folio:
/* unblock rmap walks */
if (src_folio)
folio_unlock(src_folio);
@@ -2677,8 +2821,9 @@ unlock_folio:
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
spinlock_t *ptl;
+
ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
+ if (likely(pmd_is_huge(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
@@ -2844,7 +2989,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool soft_dirty, uffd_wp = false, young = false, write = false;
bool anon_exclusive = false, dirty = false;
unsigned long addr;
pte_t *pte;
@@ -2853,7 +2998,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+ VM_WARN_ON_ONCE(!pmd_is_valid_softleaf(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2867,11 +3013,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (!vma_is_dax(vma) && vma_is_special_huge(vma))
return;
- if (unlikely(is_pmd_migration_entry(old_pmd))) {
- swp_entry_t entry;
+ if (unlikely(pmd_is_migration_entry(old_pmd))) {
+ const softleaf_t old_entry = softleaf_from_pmd(old_pmd);
- entry = pmd_to_swp_entry(old_pmd);
- folio = pfn_swap_entry_folio(entry);
+ folio = softleaf_to_folio(old_entry);
} else if (is_huge_zero_pmd(old_pmd)) {
return;
} else {
@@ -2901,20 +3046,54 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ if (pmd_is_migration_entry(*pmd)) {
+ softleaf_t entry;
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_migration_write(entry);
if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+ anon_exclusive = softleaf_is_migration_read_exclusive(entry);
+ young = softleaf_is_migration_young(entry);
+ dirty = softleaf_is_migration_dirty(entry);
+ } else if (pmd_is_device_private_entry(*pmd)) {
+ softleaf_t entry;
+
+ old_pmd = *pmd;
+ entry = softleaf_from_pmd(old_pmd);
+ page = softleaf_to_page(entry);
+ folio = page_folio(page);
+
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = softleaf_is_device_private_write(entry);
+ anon_exclusive = PageAnonExclusive(page);
+
+ /*
+ * Device private THP should be treated the same as regular
+ * folios w.r.t. anon exclusive handling. See the comments for
+ * folio handling and anon_exclusive below.
+ */
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
} else {
/*
* Up to this point the pmd is present and huge and userland has
@@ -2998,11 +3177,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- swp_entry_t swp_entry;
+ if (freeze || pmd_is_migration_entry(old_pmd)) {
+ pte_t entry;
+ swp_entry_t swp_entry;
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
@@ -3021,7 +3200,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_swp_mksoft_dirty(entry);
if (uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
+ } else if (pmd_is_device_private_entry(old_pmd)) {
+ pte_t entry;
+ swp_entry_t swp_entry;
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ /*
+ * When freeze is false, anon_exclusive was already
+ * propagated to the pages corresponding to the pte
+ * entries above.
+ */
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ /*
+ * Young and dirty bits are not propagated via swp_entry
+ */
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
VM_WARN_ON(!pte_none(ptep_get(pte + i)));
set_pte_at(mm, addr, pte + i, entry);
}
@@ -3048,7 +3253,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (!pmd_is_migration_entry(*pmd))
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3061,7 +3266,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+ if (pmd_trans_huge(*pmd) || pmd_is_valid_softleaf(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3240,6 +3445,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
+ if (folio_is_device_private(folio))
+ return;
+
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
@@ -3354,15 +3562,6 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
- /*
- * page->private should not be set in tail pages. Fix up and warn once
- * if private is unexpectedly set.
- */
- if (unlikely(new_folio->private)) {
- VM_WARN_ON_ONCE_PAGE(true, new_head);
- new_folio->private = NULL;
- }
-
if (folio_test_swapcache(folio))
new_folio->swap.val = folio->swap.val + i;
@@ -3398,8 +3597,9 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
ClearPageCompound(&folio->page);
}
-/*
- * It splits an unmapped @folio to lower order smaller folios in two ways.
+/**
+ * __split_unmapped_folio() - splits an unmapped @folio to lower order folios in
+ * two ways: uniform split or non-uniform split.
* @folio: the to-be-split folio
* @new_order: the smallest order of the after split folios (since buddy
* allocator like split generates folios with orders from @folio's
@@ -3408,64 +3608,56 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* will be split until its order becomes @new_order.
* @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
* @mapping: @folio->mapping
- * @uniform_split: if the split is uniform or not (buddy allocator like split)
+ * @split_type: whether the split is uniform (SPLIT_TYPE_UNIFORM) or
+ * buddy allocator like (SPLIT_TYPE_NON_UNIFORM)
*
*
* 1. uniform split: the given @folio into multiple @new_order small folios,
* where all small folios have the same order. This is done when
- * uniform_split is true.
+ * split_type is SPLIT_TYPE_UNIFORM.
* 2. buddy allocator like (non-uniform) split: the given @folio is split into
* half and one of the half (containing the given page) is split into half
- * until the given @page's order becomes @new_order. This is done when
- * uniform_split is false.
+ * until the given @folio's order becomes @new_order. This is done when
+ * split_type is SPLIT_TYPE_NON_UNIFORM.
*
* The high level flow for these two methods are:
- * 1. uniform split: a single __split_folio_to_order() is called to split the
- * @folio into @new_order, then we traverse all the resulting folios one by
- * one in PFN ascending order and perform stats, unfreeze, adding to list,
- * and file mapping index operations.
- * 2. non-uniform split: in general, folio_order - @new_order calls to
- * __split_folio_to_order() are made in a for loop to split the @folio
- * to one lower order at a time. The resulting small folios are processed
- * like what is done during the traversal in 1, except the one containing
- * @page, which is split in next for loop.
+ *
+ * 1. uniform split: @xas is split with no expectation of failure and a single
+ * __split_folio_to_order() is called to split the @folio into @new_order
+ * along with stats update.
+ * 2. non-uniform split: folio_order - @new_order calls to
+ * __split_folio_to_order() are expected to be made in a for loop to split
+ * the @folio to one lower order at a time. The folio containing @split_at
+ * is split in each iteration. @xas is split into half in each iteration and
+ * can fail. A failed @xas split leaves split folios as is without merging
+ * them back.
*
* After splitting, the caller's folio reference will be transferred to the
- * folio containing @page. The caller needs to unlock and/or free after-split
- * folios if necessary.
+ * folio containing @split_at. The caller needs to unlock and/or free
+ * after-split folios if necessary.
*
- * For !uniform_split, when -ENOMEM is returned, the original folio might be
- * split. The caller needs to check the input folio.
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
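/*
 * Worked example (illustrative, not part of the patch): a non-uniform
 * split of a file-backed order-9 folio down to new_order 0 iterates
 * split_order = 8, 7, ..., 0. Each pass halves only the folio that
 * contains @split_at, leaving one folio each of order 8 down to 1
 * plus two of order 0; the caller's reference ends up on the order-0
 * folio containing @split_at.
 */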
static int __split_unmapped_folio(struct folio *folio, int new_order,
struct page *split_at, struct xa_state *xas,
- struct address_space *mapping, bool uniform_split)
+ struct address_space *mapping, enum split_type split_type)
{
- int order = folio_order(folio);
- int start_order = uniform_split ? new_order : order - 1;
- bool stop_split = false;
- struct folio *next;
+ const bool is_anon = folio_test_anon(folio);
+ int old_order = folio_order(folio);
+ int start_order = split_type == SPLIT_TYPE_UNIFORM ? new_order : old_order - 1;
int split_order;
- int ret = 0;
-
- if (folio_test_anon(folio))
- mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
/*
* split to new_order one order at a time. For uniform split,
* folio is split to new_order directly.
*/
for (split_order = start_order;
- split_order >= new_order && !stop_split;
+ split_order >= new_order;
split_order--) {
- struct folio *end_folio = folio_next(folio);
- int old_order = folio_order(folio);
- struct folio *new_folio;
+ int nr_new_folios = 1UL << (old_order - split_order);
/* order-1 anonymous folio is not supported */
- if (folio_test_anon(folio) && split_order == 1)
- continue;
- if (uniform_split && split_order != new_order)
+ if (is_anon && split_order == 1)
continue;
if (mapping) {
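
(Editor's illustration, not part of the patch: the order sequence produced by
a non-uniform split can be modeled in a few lines of userspace C. Each pass
leaves one buddy folio behind and keeps halving the folio containing
@split_at, so splitting order 9 down to order 3 yields one folio each of
orders 8 through 4 plus two of order 3. The anon order-1 skip is ignored
here.)

/* Editor's sketch: userspace model, not kernel code. */
#include <stdio.h>

static void nonuniform_orders(int old_order, int new_order)
{
	int split_order;

	for (split_order = old_order - 1; split_order >= new_order;
	     split_order--)
		printf("order-%d ", split_order); /* buddy left behind */
	printf("order-%d\n", new_order); /* folio containing @split_at */
}

int main(void)
{
	nonuniform_orders(9, 3); /* order-8 ... order-4 order-3 order-3 */
	return 0;
}
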
@@ -3474,58 +3666,39 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
* irq is disabled to allocate enough memory, whereas
* non-uniform split can handle ENOMEM.
*/
- if (uniform_split)
+ if (split_type == SPLIT_TYPE_UNIFORM)
xas_split(xas, folio, old_order);
else {
xas_set_order(xas, folio->index, split_order);
xas_try_split(xas, folio, old_order);
- if (xas_error(xas)) {
- ret = xas_error(xas);
- stop_split = true;
- }
+ if (xas_error(xas))
+ return xas_error(xas);
}
}
- if (!stop_split) {
- folio_split_memcg_refs(folio, old_order, split_order);
- split_page_owner(&folio->page, old_order, split_order);
- pgalloc_tag_split(folio, old_order, split_order);
+ folio_split_memcg_refs(folio, old_order, split_order);
+ split_page_owner(&folio->page, old_order, split_order);
+ pgalloc_tag_split(folio, old_order, split_order);
+ __split_folio_to_order(folio, old_order, split_order);
- __split_folio_to_order(folio, old_order, split_order);
+ if (is_anon) {
+ mod_mthp_stat(old_order, MTHP_STAT_NR_ANON, -1);
+ mod_mthp_stat(split_order, MTHP_STAT_NR_ANON, nr_new_folios);
}
-
/*
- * Iterate through after-split folios and update folio stats.
- * But in buddy allocator like split, the folio
- * containing the specified page is skipped until its order
- * is new_order, since the folio will be worked on in next
- * iteration.
+ * If uniform split, the process is complete.
+ * If non-uniform, continue splitting the folio at @split_at
+ * as long as the next @split_order is >= @new_order.
*/
- for (new_folio = folio; new_folio != end_folio; new_folio = next) {
- next = folio_next(new_folio);
- /*
- * for buddy allocator like split, new_folio containing
- * @split_at page could be split again, thus do not
- * change stats yet. Wait until new_folio's order is
- * @new_order or stop_split is set to true by the above
- * xas_split() failure.
- */
- if (new_folio == page_folio(split_at)) {
- folio = new_folio;
- if (split_order != new_order && !stop_split)
- continue;
- }
- if (folio_test_anon(new_folio))
- mod_mthp_stat(folio_order(new_folio),
- MTHP_STAT_NR_ANON, 1);
- }
+ folio = page_folio(split_at);
+ old_order = split_order;
}
- return ret;
+ return 0;
}
-bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns)
+bool folio_split_supported(struct folio *folio, unsigned int new_order,
+ enum split_type split_type, bool warns)
{
if (folio_test_anon(folio)) {
/* order-1 is not supported for anonymous THP. */
@@ -3533,21 +3706,41 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
"Cannot split to order-1 folio");
if (new_order == 1)
return false;
- } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- /*
- * No split if the file system does not support large folio.
- * Note that we might still have THPs in such mappings due to
- * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
- * does not actually support large folios properly.
- */
- VM_WARN_ONCE(warns,
- "Cannot split file folio to non-0 order");
- return false;
+ } else if (split_type == SPLIT_TYPE_NON_UNIFORM || new_order) {
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ !mapping_large_folio_support(folio->mapping)) {
+ /*
+ * We can always split a folio down to a single page
+ * (new_order == 0) uniformly.
+ *
+ * For any other scenario
+ * a) uniform split targeting a large folio
+ * (new_order > 0)
+ * b) any non-uniform split
+ * we must confirm that the file system supports large
+ * folios.
+ *
+ * Note that we might still have THPs in such
+ * mappings, created by khugepaged when
+ * CONFIG_READ_ONLY_THP_FOR_FS is enabled. But in that
+ * case, the mapping does not actually support large
+ * folios properly.
+ */
+ VM_WARN_ONCE(warns,
+ "Cannot split file folio to non-0 order");
+ return false;
+ }
}
- /* Only swapping a whole PMD-mapped folio is supported */
- if (folio_test_swapcache(folio)) {
+ /*
+ * A swapcache folio can only be split to order 0.
+ *
+ * A non-uniform split creates after-split folios with orders from
+ * folio_order(folio) - 1 down to new_order, making it unsuitable for
+ * any swapcache folio split. Only a uniform split to order-0 can be
+ * used here.
+ */
+ if ((split_type == SPLIT_TYPE_NON_UNIFORM || new_order) && folio_test_swapcache(folio)) {
VM_WARN_ONCE(warns,
"Cannot split swapcache folio to non-0 order");
return false;
@@ -3556,41 +3749,160 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
return true;
}
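
(Editor's sketch: the support rules encoded above can be condensed into a
small userspace model. It ignores the CONFIG_READ_ONLY_THP_FOR_FS gating and
the warning plumbing; the enum values mirror the kernel's.)

/* Editor's sketch: userspace model of folio_split_supported(). */
#include <assert.h>
#include <stdbool.h>

enum split_type { SPLIT_TYPE_UNIFORM, SPLIT_TYPE_NON_UNIFORM };

static bool split_allowed(bool anon, bool swapcache, bool fs_large_folios,
			  unsigned int new_order, enum split_type type)
{
	if (anon && new_order == 1)
		return false;	/* order-1 anon folios are unsupported */
	if (!anon && (type == SPLIT_TYPE_NON_UNIFORM || new_order) &&
	    !fs_large_folios)
		return false;	/* the FS must support large folios */
	if (swapcache && (type == SPLIT_TYPE_NON_UNIFORM || new_order))
		return false;	/* swapcache: only uniform split to order-0 */
	return true;
}

int main(void)
{
	/* a swapcache folio can only be split uniformly to order 0 */
	assert(split_allowed(true, true, false, 0, SPLIT_TYPE_UNIFORM));
	assert(!split_allowed(true, true, false, 0, SPLIT_TYPE_NON_UNIFORM));
	assert(!split_allowed(true, true, false, 2, SPLIT_TYPE_UNIFORM));
	return 0;
}
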
-/* See comments in non_uniform_split_supported() */
-bool uniform_split_supported(struct folio *folio, unsigned int new_order,
- bool warns)
+static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
+ struct page *split_at, struct xa_state *xas,
+ struct address_space *mapping, bool do_lru,
+ struct list_head *list, enum split_type split_type,
+ pgoff_t end, int *nr_shmem_dropped, int extra_pins)
{
- if (folio_test_anon(folio)) {
- VM_WARN_ONCE(warns && new_order == 1,
- "Cannot split to order-1 folio");
- if (new_order == 1)
- return false;
- } else if (new_order) {
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- !mapping_large_folio_support(folio->mapping)) {
- VM_WARN_ONCE(warns,
- "Cannot split file folio to non-0 order");
- return false;
+ struct folio *end_folio = folio_next(folio);
+ struct folio *new_folio, *next;
+ int old_order = folio_order(folio);
+ int ret = 0;
+ struct deferred_split *ds_queue;
+
+ VM_WARN_ON_ONCE(!mapping && end);
+ /* Prevent deferred_split_scan() touching ->_refcount */
+ ds_queue = folio_split_queue_lock(folio);
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ struct swap_cluster_info *ci = NULL;
+ struct lruvec *lruvec;
+ int expected_refs;
+
+ if (old_order > 1) {
+ if (!list_empty(&folio->_deferred_list)) {
+ ds_queue->split_queue_len--;
+ /*
+ * Reinitialize page_deferred_list after removing the
+ * page from the split_queue, otherwise a subsequent
+ * split will see list corruption when checking the
+ * page_deferred_list.
+ */
+ list_del_init(&folio->_deferred_list);
+ }
+ if (folio_test_partially_mapped(folio)) {
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(old_order,
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
}
- }
+ split_queue_unlock(ds_queue);
+ if (mapping) {
+ int nr = folio_nr_pages(folio);
- if (new_order && folio_test_swapcache(folio)) {
- VM_WARN_ONCE(warns,
- "Cannot split swapcache folio to non-0 order");
- return false;
+ if (folio_test_pmd_mappable(folio) &&
+ new_order < HPAGE_PMD_ORDER) {
+ if (folio_test_swapbacked(folio)) {
+ lruvec_stat_mod_folio(folio,
+ NR_SHMEM_THPS, -nr);
+ } else {
+ lruvec_stat_mod_folio(folio,
+ NR_FILE_THPS, -nr);
+ filemap_nr_thps_dec(mapping);
+ }
+ }
+ }
+
+ if (folio_test_swapcache(folio)) {
+ if (mapping) {
+ VM_WARN_ON_ONCE_FOLIO(mapping, folio);
+ return -EINVAL;
+ }
+
+ ci = swap_cluster_get_and_lock(folio);
+ }
+
+ /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
+ if (do_lru)
+ lruvec = folio_lruvec_lock(folio);
+
+ ret = __split_unmapped_folio(folio, new_order, split_at, xas,
+ mapping, split_type);
+
+ /*
+ * Unfreeze after-split folios and put them back to the right
+ * list. @folio should be kept frozen until page cache
+ * entries are updated with all the other after-split folios
+ * to prevent others seeing stale page cache entries.
+ * As a result, new_folio starts from the next folio of
+ * @folio.
+ */
+ for (new_folio = folio_next(folio); new_folio != end_folio;
+ new_folio = next) {
+ unsigned long nr_pages = folio_nr_pages(new_folio);
+
+ next = folio_next(new_folio);
+
+ zone_device_private_split_cb(folio, new_folio);
+
+ expected_refs = folio_expected_ref_count(new_folio) + 1;
+ folio_ref_unfreeze(new_folio, expected_refs);
+
+ if (do_lru)
+ lru_add_split_folio(folio, new_folio, lruvec, list);
+
+ /*
+ * Anonymous folio with swap cache.
+ * NOTE: shmem in swap cache is not supported yet.
+ */
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
+ continue;
+ }
+
+ /* Anonymous folio without swap cache */
+ if (!mapping)
+ continue;
+
+ /* Add the new folio to the page cache. */
+ if (new_folio->index < end) {
+ __xa_store(&mapping->i_pages, new_folio->index,
+ new_folio, 0);
+ continue;
+ }
+
+ VM_WARN_ON_ONCE(!nr_shmem_dropped);
+ /* Drop folio beyond EOF: ->index >= end */
+ if (shmem_mapping(mapping) && nr_shmem_dropped)
+ *nr_shmem_dropped += nr_pages;
+ else if (folio_test_clear_dirty(new_folio))
+ folio_account_cleaned(
+ new_folio, inode_to_wb(mapping->host));
+ __filemap_remove_folio(new_folio, NULL);
+ folio_put_refs(new_folio, nr_pages);
+ }
+
+ zone_device_private_split_cb(folio, NULL);
+ /*
+ * Unfreeze @folio only after all page cache entries, which
+ * used to point to it, have been updated with new folios.
+ * Otherwise, a parallel folio_try_get() can grab @folio
+ * and its caller can see stale page cache entries.
+ */
+ expected_refs = folio_expected_ref_count(folio) + 1;
+ folio_ref_unfreeze(folio, expected_refs);
+
+ if (do_lru)
+ unlock_page_lruvec(lruvec);
+
+ if (ci)
+ swap_cluster_unlock(ci);
+ } else {
+ split_queue_unlock(ds_queue);
+ return -EAGAIN;
}
- return true;
+ return ret;
}
-/*
- * __folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * __folio_split() - split a folio at @split_at to a @new_order folio
* @folio: folio to split
* @new_order: the order of the new folio
* @split_at: a page within the new folio
* @lock_at: a page within @folio to be left locked to caller
* @list: after-split folios will be put on it if non NULL
- * @uniform_split: perform uniform split or not (non-uniform split)
+ * @split_type: whether to perform a uniform or a non-uniform (buddy
+ * allocator like) split
*
* It calls __split_unmapped_folio() to perform uniform and non-uniform split.
* It is in charge of checking whether the split is supported or not and
@@ -3601,25 +3913,24 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
* 1. for uniform split, @lock_at points to one of @folio's subpages;
* 2. for buddy allocator like (non-uniform) split, @lock_at points to @folio.
*
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
* split but not to @new_order, the caller needs to check)
*/
static int __folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct page *lock_at,
- struct list_head *list, bool uniform_split)
+ struct list_head *list, enum split_type split_type)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(folio);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct folio *end_folio = folio_next(folio);
bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
struct anon_vma *anon_vma = NULL;
- int order = folio_order(folio);
+ int old_order = folio_order(folio);
struct folio *new_folio, *next;
int nr_shmem_dropped = 0;
int remap_flags = 0;
int extra_pins, ret;
- pgoff_t end;
+ pgoff_t end = 0;
bool is_hzp;
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -3638,14 +3949,10 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (!is_anon && !folio->mapping)
return -EBUSY;
- if (new_order >= folio_order(folio))
+ if (new_order >= old_order)
return -EINVAL;
- if (uniform_split && !uniform_split_supported(folio, new_order, true))
- return -EINVAL;
-
- if (!uniform_split &&
- !non_uniform_split_supported(folio, new_order, true))
+ if (!folio_split_supported(folio, new_order, split_type, /* warn = */ true))
return -EINVAL;
is_hzp = is_huge_zero_folio(folio);
@@ -3671,8 +3978,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
ret = -EBUSY;
goto out;
}
- mapping = NULL;
anon_vma_lock_write(anon_vma);
+ mapping = NULL;
} else {
unsigned int min_order;
gfp_t gfp;
@@ -3692,9 +3999,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
goto out;
}
- if (uniform_split) {
+ if (split_type == SPLIT_TYPE_UNIFORM) {
xas_set_order(&xas, folio->index, new_order);
- xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+ xas_split_alloc(&xas, folio, old_order, gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -3742,127 +4049,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
}
- /* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&ds_queue->split_queue_lock);
- if (folio_ref_freeze(folio, 1 + extra_pins)) {
- struct swap_cluster_info *ci = NULL;
- struct lruvec *lruvec;
- int expected_refs;
-
- if (folio_order(folio) > 1 &&
- !list_empty(&folio->_deferred_list)) {
- ds_queue->split_queue_len--;
- if (folio_test_partially_mapped(folio)) {
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- /*
- * Reinitialize page_deferred_list after removing the
- * page from the split_queue, otherwise a subsequent
- * split will see list corruption when checking the
- * page_deferred_list.
- */
- list_del_init(&folio->_deferred_list);
- }
- spin_unlock(&ds_queue->split_queue_lock);
- if (mapping) {
- int nr = folio_nr_pages(folio);
-
- if (folio_test_pmd_mappable(folio) &&
- new_order < HPAGE_PMD_ORDER) {
- if (folio_test_swapbacked(folio)) {
- __lruvec_stat_mod_folio(folio,
- NR_SHMEM_THPS, -nr);
- } else {
- __lruvec_stat_mod_folio(folio,
- NR_FILE_THPS, -nr);
- filemap_nr_thps_dec(mapping);
- }
- }
- }
-
- if (folio_test_swapcache(folio)) {
- if (mapping) {
- VM_WARN_ON_ONCE_FOLIO(mapping, folio);
- ret = -EINVAL;
- goto fail;
- }
-
- ci = swap_cluster_get_and_lock(folio);
- }
-
- /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = folio_lruvec_lock(folio);
-
- ret = __split_unmapped_folio(folio, new_order, split_at, &xas,
- mapping, uniform_split);
-
- /*
- * Unfreeze after-split folios and put them back to the right
- * list. @folio should be kept frozon until page cache
- * entries are updated with all the other after-split folios
- * to prevent others seeing stale page cache entries.
- * As a result, new_folio starts from the next folio of
- * @folio.
- */
- for (new_folio = folio_next(folio); new_folio != end_folio;
- new_folio = next) {
- unsigned long nr_pages = folio_nr_pages(new_folio);
-
- next = folio_next(new_folio);
-
- expected_refs = folio_expected_ref_count(new_folio) + 1;
- folio_ref_unfreeze(new_folio, expected_refs);
-
- lru_add_split_folio(folio, new_folio, lruvec, list);
-
- /*
- * Anonymous folio with swap cache.
- * NOTE: shmem in swap cache is not supported yet.
- */
- if (ci) {
- __swap_cache_replace_folio(ci, folio, new_folio);
- continue;
- }
-
- /* Anonymous folio without swap cache */
- if (!mapping)
- continue;
-
- /* Add the new folio to the page cache. */
- if (new_folio->index < end) {
- __xa_store(&mapping->i_pages, new_folio->index,
- new_folio, 0);
- continue;
- }
-
- /* Drop folio beyond EOF: ->index >= end */
- if (shmem_mapping(mapping))
- nr_shmem_dropped += nr_pages;
- else if (folio_test_clear_dirty(new_folio))
- folio_account_cleaned(
- new_folio, inode_to_wb(mapping->host));
- __filemap_remove_folio(new_folio, NULL);
- folio_put_refs(new_folio, nr_pages);
- }
- /*
- * Unfreeze @folio only after all page cache entries, which
- * used to point to it, have been updated with new folios.
- * Otherwise, a parallel folio_try_get() can grab @folio
- * and its caller can see stale page cache entries.
- */
- expected_refs = folio_expected_ref_count(folio) + 1;
- folio_ref_unfreeze(folio, expected_refs);
-
- unlock_page_lruvec(lruvec);
-
- if (ci)
- swap_cluster_unlock(ci);
- } else {
- spin_unlock(&ds_queue->split_queue_lock);
- ret = -EAGAIN;
- }
+ ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
+ true, list, split_type, end, &nr_shmem_dropped,
+ extra_pins);
fail:
if (mapping)
xas_unlock(&xas);
@@ -3872,9 +4061,10 @@ fail:
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
- if (!ret && is_anon)
+ if (!ret && is_anon && !folio_is_device_private(folio))
remap_flags = RMP_USE_SHARED_ZEROPAGE;
- remap_page(folio, 1 << order, remap_flags);
+
+ remap_page(folio, 1 << old_order, remap_flags);
/*
* Unlock all after-split folios except the one containing
@@ -3905,9 +4095,51 @@ out_unlock:
i_mmap_unlock_read(mapping);
out:
xas_destroy(&xas);
- if (order == HPAGE_PMD_ORDER)
+ if (old_order == HPAGE_PMD_ORDER)
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
- count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
+ count_mthp_stat(old_order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
+ return ret;
+}
+
+/**
+ * folio_split_unmapped() - split a large anon folio that is already unmapped
+ * @folio: folio to split
+ * @new_order: the order of folios after split
+ *
+ * This function is a helper for splitting folios that have already been
+ * unmapped. The use case arises during migration, when either the device or
+ * the CPU may refuse to migrate a THP partway through, due to allocation
+ * failures on either side.
+ *
+ * anon_vma_lock is not required to be held, but mmap_read_lock() or
+ * mmap_write_lock() should be held. @folio is expected to be locked by the
+ * caller. Both device-private and non-device-private folios are supported,
+ * along with folios that are in the swapcache. @folio should also be unmapped
+ * and isolated from the LRU (if applicable).
+ *
+ * Upon return, the folio is not remapped, split folios are not added to LRU,
+ * free_folio_and_swap_cache() is not called, and new folios remain locked.
+ *
+ * Return: 0 on success, -EAGAIN if the folio cannot be split (e.g., due to
+ * an unexpected reference count or extra pins).
+ */
+int folio_split_unmapped(struct folio *folio, unsigned int new_order)
+{
+ int extra_pins, ret = 0;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
+
+ if (!can_split_folio(folio, 1, &extra_pins))
+ return -EAGAIN;
+
+ local_irq_disable();
+ ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
+ NULL, false, NULL, SPLIT_TYPE_UNIFORM,
+ 0, NULL, extra_pins);
+ local_irq_enable();
return ret;
}
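
(Editor's sketch of a hypothetical caller; the function name and migration
context are illustrative, not from this series. It assumes the prerequisites
listed above: mmap lock held, @folio locked, unmapped, and LRU-isolated.)

/* Editor's sketch, not kernel code from this patch. */
static int demote_thp_for_migration(struct folio *folio)
{
	int ret;

	/* mmap_read_lock() held; @folio locked, unmapped, off the LRU */
	ret = folio_split_unmapped(folio, 0);
	if (ret)
		return ret;	/* -EAGAIN: unexpected pins, caller may retry */

	/*
	 * After-split folios stay locked and off the LRU; the caller
	 * continues the migration with the smaller folios.
	 */
	return 0;
}
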
@@ -3958,22 +4190,22 @@ out:
* Returns -EINVAL when trying to split to an order that is incompatible
* with the folio. Splitting to order 0 is compatible with all folios.
*/
-int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
{
struct folio *folio = page_folio(page);
- return __folio_split(folio, new_order, &folio->page, page, list, true);
+ return __folio_split(folio, new_order, &folio->page, page, list,
+ SPLIT_TYPE_UNIFORM);
}
-/*
- * folio_split: split a folio at @split_at to a @new_order folio
+/**
+ * folio_split() - split a folio at @split_at to a @new_order folio
* @folio: folio to split
* @new_order: the order of the new folio
* @split_at: a page within the new folio
- *
- * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
- * split but not to @new_order, the caller needs to check)
+ * @list: after-split folios are added to @list if non NULL, otherwise to the
+ * LRU list
*
* It has the same prerequisites and returns as
* split_huge_page_to_list_to_order().
@@ -3987,12 +4219,15 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
* [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
*
* After split, folio is left locked for caller.
+ *
+ * Return: 0 - successful, <0 - failed (if -ENOMEM is returned, @folio might be
+ * split but not to @new_order, the caller needs to check)
*/
int folio_split(struct folio *folio, unsigned int new_order,
struct page *split_at, struct list_head *list)
{
return __folio_split(folio, new_order, split_at, &folio->page, list,
- false);
+ SPLIT_TYPE_NON_UNIFORM);
}
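
(Editor's sketch of a hypothetical folio_split() caller; the wrapper name is
illustrative. Per the kernel-doc above, the after-split folio containing
@folio's head page is left locked, and the siblings go back to the LRU when
@list is NULL.)

/* Editor's sketch, assuming @folio is locked by the caller. */
static int split_folio_around(struct folio *folio, struct page *page)
{
	/* non-uniform split: @page ends up in an order-3 folio */
	return folio_split(folio, 3, page, NULL);
}
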
int min_order_for_split(struct folio *folio)
@@ -4034,10 +4269,9 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
bool unqueued = false;
WARN_ON_ONCE(folio_ref_count(folio));
- WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg_charged(folio));
- ds_queue = get_deferred_split_queue(folio);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
if (folio_test_partially_mapped(folio)) {
@@ -4048,7 +4282,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
list_del_init(&folio->_deferred_list);
unqueued = true;
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
return unqueued; /* useful for debug warnings */
}
@@ -4056,10 +4290,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio)
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
- struct deferred_split *ds_queue = get_deferred_split_queue(folio);
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = folio_memcg(folio);
-#endif
+ struct deferred_split *ds_queue;
unsigned long flags;
/*
@@ -4082,7 +4313,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
if (folio_test_swapcache(folio))
return;
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ ds_queue = folio_split_queue_lock_irqsave(folio, &flags);
if (partially_mapped) {
if (!folio_test_partially_mapped(folio)) {
folio_set_partially_mapped(folio);
@@ -4097,15 +4328,16 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
}
if (list_empty(&folio->_deferred_list)) {
+ struct mem_cgroup *memcg;
+
+ memcg = folio_split_queue_memcg(folio, ds_queue);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
-#ifdef CONFIG_MEMCG
if (memcg)
set_shrinker_bit(memcg, folio_nid(folio),
- deferred_split_shrinker->id);
-#endif
+ shrinker_id(deferred_split_shrinker));
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
@@ -4151,43 +4383,42 @@ static bool thp_underused(struct folio *folio)
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct pglist_data *pgdata = NODE_DATA(sc->nid);
- struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+ struct deferred_split *ds_queue;
unsigned long flags;
- LIST_HEAD(list);
- struct folio *folio, *next, *prev = NULL;
- int split = 0, removed = 0;
+ struct folio *folio, *next;
+ int split = 0, i;
+ struct folio_batch fbatch;
-#ifdef CONFIG_MEMCG
- if (sc->memcg)
- ds_queue = &sc->memcg->deferred_split_queue;
-#endif
+ folio_batch_init(&fbatch);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+retry:
+ ds_queue = split_queue_lock_irqsave(sc->nid, sc->memcg, &flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
_deferred_list) {
if (folio_try_get(folio)) {
- list_move(&folio->_deferred_list, &list);
- } else {
+ folio_batch_add(&fbatch, folio);
+ } else if (folio_test_partially_mapped(folio)) {
/* We lost race with folio_put() */
- if (folio_test_partially_mapped(folio)) {
- folio_clear_partially_mapped(folio);
- mod_mthp_stat(folio_order(folio),
- MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
- }
- list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
+ folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
+ list_del_init(&folio->_deferred_list);
+ ds_queue->split_queue_len--;
if (!--sc->nr_to_scan)
break;
+ if (!folio_batch_space(&fbatch))
+ break;
}
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ split_queue_unlock_irqrestore(ds_queue, flags);
- list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
bool did_split = false;
bool underused = false;
+ struct deferred_split *fqueue;
+ folio = fbatch.folios[i];
if (!folio_test_partially_mapped(folio)) {
/*
* See try_to_map_unused_to_zeropage(): we cannot
@@ -4210,38 +4441,27 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
}
folio_unlock(folio);
next:
+ if (did_split || !folio_test_partially_mapped(folio))
+ continue;
/*
- * split_folio() removes folio from list on success.
* Only add back to the queue if folio is partially mapped.
* If thp_underused returns false, or if split_folio fails
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (did_split) {
- ; /* folio already removed from list */
- } else if (!folio_test_partially_mapped(folio)) {
- list_del_init(&folio->_deferred_list);
- removed++;
- } else {
- /*
- * That unlocked list_del_init() above would be unsafe,
- * unless its folio is separated from any earlier folios
- * left on the list (which may be concurrently unqueued)
- * by one safe folio with refcount still raised.
- */
- swap(folio, prev);
+ fqueue = folio_split_queue_lock_irqsave(folio, &flags);
+ if (list_empty(&folio->_deferred_list)) {
+ list_add_tail(&folio->_deferred_list, &fqueue->split_queue);
+ fqueue->split_queue_len++;
}
- if (folio)
- folio_put(folio);
+ split_queue_unlock_irqrestore(fqueue, flags);
}
+ folios_put(&fbatch);
- spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
- list_splice_tail(&list, &ds_queue->split_queue);
- ds_queue->split_queue_len -= removed;
- spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
-
- if (prev)
- folio_put(prev);
+ if (sc->nr_to_scan && !list_empty(&ds_queue->split_queue)) {
+ cond_resched();
+ goto retry;
+ }
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
@@ -4252,6 +4472,33 @@ next:
return split;
}
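
(Editor's note: the retry loop exists because a folio_batch holds a bounded
number of folios (31 at the time of writing), so one pass may not consume
sc->nr_to_scan. A runnable userspace model of the drain-and-retry shape; the
numbers are illustrative.)

/* Editor's sketch: userspace model of the batched drain loop. */
#include <stdio.h>

#define BATCH	31	/* a folio_batch holds up to 31 folios */

int main(void)
{
	int queued = 100, nr_to_scan = 64;

	while (queued && nr_to_scan) {
		int take = queued < BATCH ? queued : BATCH;

		if (take > nr_to_scan)
			take = nr_to_scan;
		/* lock the queue, pull up to BATCH entries, unlock */
		queued -= take;
		nr_to_scan -= take;
		/* process the batch without the queue lock, then retry */
	}
	printf("left on queue: %d\n", queued);
	return 0;
}
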
+#ifdef CONFIG_MEMCG
+void reparent_deferred_split_queue(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct deferred_split *ds_queue = &memcg->deferred_split_queue;
+ struct deferred_split *parent_ds_queue = &parent->deferred_split_queue;
+ int nid;
+
+ spin_lock_irq(&ds_queue->split_queue_lock);
+ spin_lock_nested(&parent_ds_queue->split_queue_lock, SINGLE_DEPTH_NESTING);
+
+ if (!ds_queue->split_queue_len)
+ goto unlock;
+
+ list_splice_tail_init(&ds_queue->split_queue, &parent_ds_queue->split_queue);
+ parent_ds_queue->split_queue_len += ds_queue->split_queue_len;
+ ds_queue->split_queue_len = 0;
+
+ for_each_node(nid)
+ set_shrinker_bit(parent, nid, shrinker_id(deferred_split_shrinker));
+
+unlock:
+ spin_unlock(&parent_ds_queue->split_queue_lock);
+ spin_unlock_irq(&ds_queue->split_queue_lock);
+}
+#endif
+
#ifdef CONFIG_DEBUG_FS
static void split_huge_pages_all(void)
{
@@ -4613,7 +4860,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ if (unlikely(!pmd_present(*pvmw->pmd)))
+ pmdval = pmdp_huge_get_and_clear(vma->vm_mm, address, pvmw->pmd);
+ else
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
/* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
@@ -4655,30 +4905,48 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
unsigned long address = pvmw->address;
unsigned long haddr = address & HPAGE_PMD_MASK;
pmd_t pmde;
- swp_entry_t entry;
+ softleaf_t entry;
if (!(pvmw->pmd && !pvmw->pte))
return;
- entry = pmd_to_swp_entry(*pvmw->pmd);
+ entry = softleaf_from_pmd(*pvmw->pmd);
folio_get(folio);
pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
+
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_writable_migration_entry(entry))
+ if (softleaf_is_migration_write(entry))
pmde = pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_mkuffd_wp(pmde);
- if (!is_migration_entry_young(entry))
+ if (!softleaf_is_migration_young(entry))
pmde = pmd_mkold(pmde);
/* NOTE: this may contain setting soft-dirty on some archs */
- if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
pmde = pmd_mkdirty(pmde);
+ if (folio_is_device_private(folio)) {
+ swp_entry_t entry;
+
+ if (pmd_write(pmde))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
+ pmde = swp_entry_to_pmd(entry);
+
+ if (pmd_swp_soft_dirty(*pvmw->pmd))
+ pmde = pmd_swp_mksoft_dirty(pmde);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_swp_mkuffd_wp(pmde);
+ }
+
if (folio_test_anon(folio)) {
rmap_t rmap_flags = RMAP_NONE;
- if (!is_readable_migration_entry(entry))
+ if (!softleaf_is_migration_read(entry))
rmap_flags |= RMAP_EXCLUSIVE;
folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
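
(Editor's note: this hunk swaps the swp_entry_t migration predicates for
their softleaf_t counterparts; the correspondence, as visible in the -/+
lines above, is:

	pmd_to_swp_entry(pmd)           ->  softleaf_from_pmd(pmd)
	is_writable_migration_entry(e)  ->  softleaf_is_migration_write(e)
	is_migration_entry_young(e)     ->  softleaf_is_migration_young(e)
	is_migration_entry_dirty(e)     ->  softleaf_is_migration_dirty(e)
	is_readable_migration_entry(e)  ->  softleaf_is_migration_read(e)

The new device-private branch also re-encodes the PMD as a device-private
swap entry, preserving the soft-dirty and uffd-wp bits, before the rmap is
restored.)
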
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..9e7815b4f058 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,7 +7,6 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
-#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
@@ -19,7 +18,6 @@
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/minmax.h>
-#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
@@ -28,7 +26,7 @@
#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
@@ -39,20 +37,19 @@
#include <linux/memory.h>
#include <linux/mm_inline.h>
#include <linux/padata.h>
+#include <linux/pgalloc.h>
#include <asm/page.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>
#include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
#include "hugetlb_cma.h"
+#include "hugetlb_internal.h"
#include <linux/page-isolation.h>
int hugetlb_max_hstate __read_mostly;
@@ -119,7 +116,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);
@@ -427,17 +423,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
}
}
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
struct hugetlb_vma_lock *vma_lock;
/* Only establish in (flags) sharable vmas */
if (!vma || !(vma->vm_flags & VM_MAYSHARE))
- return;
+ return 0;
/* Should never get here with non-NULL vm_private_data */
if (vma->vm_private_data)
- return;
+ return -EINVAL;
vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
if (!vma_lock) {
@@ -452,13 +452,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
* allocation failure.
*/
pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
- return;
+ return -EINVAL;
}
kref_init(&vma_lock->refs);
init_rwsem(&vma_lock->rw_sema);
vma_lock->vma = vma;
vma->vm_private_data = vma_lock;
+
+ return 0;
}
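
(Editor's sketch: hugetlb_vma_lock_alloc() now reports failure instead of
returning void; a hypothetical caller (the wrapper name is illustrative) can
propagate the error.)

/* Editor's sketch, assuming only the helper shown above. */
static int hugetlb_setup_vma_lock(struct vm_area_struct *vma)
{
	int err = hugetlb_vma_lock_alloc(vma);

	if (err)
		return err;	/* -EINVAL: allocation failed or lock present */
	return 0;
}
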
/* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1190,20 +1192,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
}
}
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
- VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+ VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
- set_vma_private_data(vma, (unsigned long)map);
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
{
- VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+ VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
- set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+ desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+ VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+ desc->private_data = (void *)((unsigned long)desc->private_data | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1213,6 +1223,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+ return ((unsigned long)desc->private_data) & flag;
+}
+
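
(Editor's illustration: desc->private_data packs the resv_map pointer and the
HPAGE_RESV_* flag bits into a single word, relying on pointer alignment to
keep the low bits clear. A runnable userspace model; the flag values mirror
the kernel's but are used here purely for illustration.)

/* Editor's sketch: userspace model of the pointer/flag packing. */
#include <assert.h>
#include <stdlib.h>

#define HPAGE_RESV_OWNER	(1UL << 0)
#define HPAGE_RESV_UNMAPPED	(1UL << 1)

int main(void)
{
	/* a kmalloc'ed resv_map is aligned, so the low bits are free */
	void *map = aligned_alloc(64, 64);
	void *private_data = (void *)((unsigned long)map | HPAGE_RESV_OWNER);

	assert(((unsigned long)private_data & HPAGE_RESV_OWNER) != 0);
	assert(((unsigned long)private_data & HPAGE_RESV_UNMAPPED) == 0);

	free(map);	/* free the real pointer, not the flagged one */
	return 0;
}
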
bool __vma_private_lock(struct vm_area_struct *vma)
{
return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -1400,77 +1417,6 @@ err:
return NULL;
}
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed. Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- nid = next_node_in(nid, *nodes_allowed);
- VM_BUG_ON(nid >= MAX_NUMNODES);
-
- return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- if (!node_isset(nid, *nodes_allowed))
- nid = next_node_allowed(nid, nodes_allowed);
- return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(int *next_node,
- nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(*next_node, nodes_allowed);
- *next_node = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-/*
- * helper for remove_pool_hugetlb_folio() - return the previously saved
- * node ["this node"] from which to free a huge page. Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
- h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
- nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_free(hs, mask)) || 1); \
- nr_nodes--)
-
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#ifdef CONFIG_CONTIG_ALLOC
static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
@@ -1526,8 +1472,8 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
*
* Must be called with hugetlb lock held.
*/
-static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
- bool adjust_surplus)
+void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
{
int nid = folio_nid(folio);
@@ -1535,7 +1481,7 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
lockdep_assert_held(&hugetlb_lock);
- if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ if (hstate_is_gigantic_no_runtime(h))
return;
list_del(&folio->lru);
@@ -1562,8 +1508,8 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
h->nr_huge_pages_node[nid]--;
}
-static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
- bool adjust_surplus)
+void add_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus)
{
int nid = folio_nid(folio);
@@ -1597,7 +1543,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
{
bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
- if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ if (hstate_is_gigantic_no_runtime(h))
return;
/*
@@ -1894,7 +1840,7 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
h->nr_huge_pages_node[folio_nid(folio)]++;
}
-static void init_new_hugetlb_folio(struct folio *folio)
+void init_new_hugetlb_folio(struct folio *folio)
{
__folio_set_hugetlb(folio);
INIT_LIST_HEAD(&folio->lru);
@@ -2006,8 +1952,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
return folio;
}
-static void prep_and_add_allocated_folios(struct hstate *h,
- struct list_head *folio_list)
+void prep_and_add_allocated_folios(struct hstate *h,
+ struct list_head *folio_list)
{
unsigned long flags;
struct folio *folio, *tmp_f;
@@ -2212,7 +2158,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
{
struct folio *folio = NULL;
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic_no_runtime(h))
return NULL;
spin_lock_irq(&hugetlb_lock);
@@ -2491,7 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
- if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ if (hstate_is_gigantic_no_runtime(h))
goto out;
/*
@@ -2934,7 +2880,7 @@ typedef enum {
* NOTE: This is mostly identical to MAP_CHG_NEEDED, except
* that currently vma_needs_reservation() has an unwanted side
* effect to either use end() or commit() to complete the
- * transaction. Hence it needs to differenciate from NEEDED.
+ * transaction. Hence it needs to differentiate from NEEDED.
*/
MAP_CHG_ENFORCED = 2,
} map_chg_state;
@@ -3215,7 +3161,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
unsigned long start_page_number,
unsigned long end_page_number)
{
- enum zone_type zone = zone_idx(folio_zone(folio));
+ enum zone_type zone = folio_zonenum(folio);
int nid = folio_nid(folio);
struct page *page = folio_page(folio, start_page_number);
unsigned long head_pfn = folio_pfn(folio);
@@ -3248,7 +3194,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
ret = folio_ref_freeze(folio, 1);
VM_BUG_ON(!ret);
hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
- prep_compound_head((struct page *)folio, huge_page_order(h));
+ prep_compound_head(&folio->page, huge_page_order(h));
}
static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
@@ -3705,7 +3651,7 @@ static void __init hugetlb_init_hstates(void)
* - If CMA allocation is possible, we can not demote
* HUGETLB_PAGE_ORDER or smaller size pages.
*/
- if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ if (hstate_is_gigantic_no_runtime(h))
continue;
if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
continue;
@@ -4062,8 +4008,8 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
return rc;
}
-static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
- unsigned long nr_to_demote)
+long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
+ unsigned long nr_to_demote)
__must_hold(&hugetlb_lock)
{
int nr_nodes, node;
@@ -4131,58 +4077,14 @@ static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
return -EBUSY;
}
-#define HSTATE_ATTR_RO(_name) \
- static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-
-#define HSTATE_ATTR_WO(_name) \
- static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
-
-#define HSTATE_ATTR(_name) \
- static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
-
-static struct kobject *hugepages_kobj;
-static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
-
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
-
-static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
-{
- int i;
-
- for (i = 0; i < HUGE_MAX_HSTATE; i++)
- if (hstate_kobjs[i] == kobj) {
- if (nidp)
- *nidp = NUMA_NO_NODE;
- return &hstates[i];
- }
-
- return kobj_to_node_hstate(kobj, nidp);
-}
-
-static ssize_t nr_hugepages_show_common(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h;
- unsigned long nr_huge_pages;
- int nid;
-
- h = kobj_to_hstate(kobj, &nid);
- if (nid == NUMA_NO_NODE)
- nr_huge_pages = h->nr_huge_pages;
- else
- nr_huge_pages = h->nr_huge_pages_node[nid];
-
- return sysfs_emit(buf, "%lu\n", nr_huge_pages);
-}
-
-static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
struct hstate *h, int nid,
unsigned long count, size_t len)
{
int err;
nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ if (hstate_is_gigantic_no_runtime(h))
return -EINVAL;
if (nid == NUMA_NO_NODE) {
@@ -4208,458 +4110,6 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
return err ? err : len;
}
-static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
- struct kobject *kobj, const char *buf,
- size_t len)
-{
- struct hstate *h;
- unsigned long count;
- int nid;
- int err;
-
- err = kstrtoul(buf, 10, &count);
- if (err)
- return err;
-
- h = kobj_to_hstate(kobj, &nid);
- return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
-}
-
-static ssize_t nr_hugepages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return nr_hugepages_show_common(kobj, attr, buf);
-}
-
-static ssize_t nr_hugepages_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t len)
-{
- return nr_hugepages_store_common(false, kobj, buf, len);
-}
-HSTATE_ATTR(nr_hugepages);
-
-#ifdef CONFIG_NUMA
-
-/*
- * hstate attribute for optionally mempolicy-based constraint on persistent
- * huge page alloc/free.
- */
-static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
-{
- return nr_hugepages_show_common(kobj, attr, buf);
-}
-
-static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t len)
-{
- return nr_hugepages_store_common(true, kobj, buf, len);
-}
-HSTATE_ATTR(nr_hugepages_mempolicy);
-#endif
-
-
-static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
-}
-
-static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t count)
-{
- int err;
- unsigned long input;
- struct hstate *h = kobj_to_hstate(kobj, NULL);
-
- if (hstate_is_gigantic(h))
- return -EINVAL;
-
- err = kstrtoul(buf, 10, &input);
- if (err)
- return err;
-
- spin_lock_irq(&hugetlb_lock);
- h->nr_overcommit_huge_pages = input;
- spin_unlock_irq(&hugetlb_lock);
-
- return count;
-}
-HSTATE_ATTR(nr_overcommit_hugepages);
-
-static ssize_t free_hugepages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h;
- unsigned long free_huge_pages;
- int nid;
-
- h = kobj_to_hstate(kobj, &nid);
- if (nid == NUMA_NO_NODE)
- free_huge_pages = h->free_huge_pages;
- else
- free_huge_pages = h->free_huge_pages_node[nid];
-
- return sysfs_emit(buf, "%lu\n", free_huge_pages);
-}
-HSTATE_ATTR_RO(free_hugepages);
-
-static ssize_t resv_hugepages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
-}
-HSTATE_ATTR_RO(resv_hugepages);
-
-static ssize_t surplus_hugepages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h;
- unsigned long surplus_huge_pages;
- int nid;
-
- h = kobj_to_hstate(kobj, &nid);
- if (nid == NUMA_NO_NODE)
- surplus_huge_pages = h->surplus_huge_pages;
- else
- surplus_huge_pages = h->surplus_huge_pages_node[nid];
-
- return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
-}
-HSTATE_ATTR_RO(surplus_hugepages);
-
-static ssize_t demote_store(struct kobject *kobj,
- struct kobj_attribute *attr, const char *buf, size_t len)
-{
- unsigned long nr_demote;
- unsigned long nr_available;
- nodemask_t nodes_allowed, *n_mask;
- struct hstate *h;
- int err;
- int nid;
-
- err = kstrtoul(buf, 10, &nr_demote);
- if (err)
- return err;
- h = kobj_to_hstate(kobj, &nid);
-
- if (nid != NUMA_NO_NODE) {
- init_nodemask_of_node(&nodes_allowed, nid);
- n_mask = &nodes_allowed;
- } else {
- n_mask = &node_states[N_MEMORY];
- }
-
- /* Synchronize with other sysfs operations modifying huge pages */
- mutex_lock(&h->resize_lock);
- spin_lock_irq(&hugetlb_lock);
-
- while (nr_demote) {
- long rc;
-
- /*
- * Check for available pages to demote each time thorough the
- * loop as demote_pool_huge_page will drop hugetlb_lock.
- */
- if (nid != NUMA_NO_NODE)
- nr_available = h->free_huge_pages_node[nid];
- else
- nr_available = h->free_huge_pages;
- nr_available -= h->resv_huge_pages;
- if (!nr_available)
- break;
-
- rc = demote_pool_huge_page(h, n_mask, nr_demote);
- if (rc < 0) {
- err = rc;
- break;
- }
-
- nr_demote -= rc;
- }
-
- spin_unlock_irq(&hugetlb_lock);
- mutex_unlock(&h->resize_lock);
-
- if (err)
- return err;
- return len;
-}
-HSTATE_ATTR_WO(demote);
-
-static ssize_t demote_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct hstate *h = kobj_to_hstate(kobj, NULL);
- unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
-
- return sysfs_emit(buf, "%lukB\n", demote_size);
-}
-
-static ssize_t demote_size_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct hstate *h, *demote_hstate;
- unsigned long demote_size;
- unsigned int demote_order;
-
- demote_size = (unsigned long)memparse(buf, NULL);
-
- demote_hstate = size_to_hstate(demote_size);
- if (!demote_hstate)
- return -EINVAL;
- demote_order = demote_hstate->order;
- if (demote_order < HUGETLB_PAGE_ORDER)
- return -EINVAL;
-
- /* demote order must be smaller than hstate order */
- h = kobj_to_hstate(kobj, NULL);
- if (demote_order >= h->order)
- return -EINVAL;
-
- /* resize_lock synchronizes access to demote size and writes */
- mutex_lock(&h->resize_lock);
- h->demote_order = demote_order;
- mutex_unlock(&h->resize_lock);
-
- return count;
-}
-HSTATE_ATTR(demote_size);
-
-static struct attribute *hstate_attrs[] = {
- &nr_hugepages_attr.attr,
- &nr_overcommit_hugepages_attr.attr,
- &free_hugepages_attr.attr,
- &resv_hugepages_attr.attr,
- &surplus_hugepages_attr.attr,
-#ifdef CONFIG_NUMA
- &nr_hugepages_mempolicy_attr.attr,
-#endif
- NULL,
-};
-
-static const struct attribute_group hstate_attr_group = {
- .attrs = hstate_attrs,
-};
-
-static struct attribute *hstate_demote_attrs[] = {
- &demote_size_attr.attr,
- &demote_attr.attr,
- NULL,
-};
-
-static const struct attribute_group hstate_demote_attr_group = {
- .attrs = hstate_demote_attrs,
-};
-
-static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
- struct kobject **hstate_kobjs,
- const struct attribute_group *hstate_attr_group)
-{
- int retval;
- int hi = hstate_index(h);
-
- hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
- if (!hstate_kobjs[hi])
- return -ENOMEM;
-
- retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
- if (retval) {
- kobject_put(hstate_kobjs[hi]);
- hstate_kobjs[hi] = NULL;
- return retval;
- }
-
- if (h->demote_order) {
- retval = sysfs_create_group(hstate_kobjs[hi],
- &hstate_demote_attr_group);
- if (retval) {
- pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
- sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
- kobject_put(hstate_kobjs[hi]);
- hstate_kobjs[hi] = NULL;
- return retval;
- }
- }
-
- return 0;
-}
-
-#ifdef CONFIG_NUMA
-static bool hugetlb_sysfs_initialized __ro_after_init;
-
-/*
- * node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node devices in node_devices[] using a parallel array. The array
- * index of a node device or _hstate == node id.
- * This is here to avoid any static dependency of the node device driver, in
- * the base kernel, on the hugetlb module.
- */
-struct node_hstate {
- struct kobject *hugepages_kobj;
- struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
-};
-static struct node_hstate node_hstates[MAX_NUMNODES];
-
-/*
- * A subset of global hstate attributes for node devices
- */
-static struct attribute *per_node_hstate_attrs[] = {
- &nr_hugepages_attr.attr,
- &free_hugepages_attr.attr,
- &surplus_hugepages_attr.attr,
- NULL,
-};
-
-static const struct attribute_group per_node_hstate_attr_group = {
- .attrs = per_node_hstate_attrs,
-};
-
-/*
- * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
- * Returns node id via non-NULL nidp.
- */
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
-{
- int nid;
-
- for (nid = 0; nid < nr_node_ids; nid++) {
- struct node_hstate *nhs = &node_hstates[nid];
- int i;
- for (i = 0; i < HUGE_MAX_HSTATE; i++)
- if (nhs->hstate_kobjs[i] == kobj) {
- if (nidp)
- *nidp = nid;
- return &hstates[i];
- }
- }
-
- BUG();
- return NULL;
-}
-
-/*
- * Unregister hstate attributes from a single node device.
- * No-op if no hstate attributes attached.
- */
-void hugetlb_unregister_node(struct node *node)
-{
- struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->dev.id];
-
- if (!nhs->hugepages_kobj)
- return; /* no hstate attributes */
-
- for_each_hstate(h) {
- int idx = hstate_index(h);
- struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
-
- if (!hstate_kobj)
- continue;
- if (h->demote_order)
- sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
- sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
- kobject_put(hstate_kobj);
- nhs->hstate_kobjs[idx] = NULL;
- }
-
- kobject_put(nhs->hugepages_kobj);
- nhs->hugepages_kobj = NULL;
-}
-
-
-/*
- * Register hstate attributes for a single node device.
- * No-op if attributes already registered.
- */
-void hugetlb_register_node(struct node *node)
-{
- struct hstate *h;
- struct node_hstate *nhs = &node_hstates[node->dev.id];
- int err;
-
- if (!hugetlb_sysfs_initialized)
- return;
-
- if (nhs->hugepages_kobj)
- return; /* already allocated */
-
- nhs->hugepages_kobj = kobject_create_and_add("hugepages",
- &node->dev.kobj);
- if (!nhs->hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
- nhs->hstate_kobjs,
- &per_node_hstate_attr_group);
- if (err) {
- pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
- h->name, node->dev.id);
- hugetlb_unregister_node(node);
- break;
- }
- }
-}
-
-/*
- * hugetlb init time: register hstate attributes for all registered node
- * devices of nodes that have memory. All on-line nodes should have
- * registered their associated device by this time.
- */
-static void __init hugetlb_register_all_nodes(void)
-{
- int nid;
-
- for_each_online_node(nid)
- hugetlb_register_node(node_devices[nid]);
-}
-#else /* !CONFIG_NUMA */
-
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
-{
- BUG();
- if (nidp)
- *nidp = -1;
- return NULL;
-}
-
-static void hugetlb_register_all_nodes(void) { }
-
-#endif
-
-static void __init hugetlb_sysfs_init(void)
-{
- struct hstate *h;
- int err;
-
- hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
- if (!hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
- hstate_kobjs, &hstate_attr_group);
- if (err)
- pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
- }
-
-#ifdef CONFIG_NUMA
- hugetlb_sysfs_initialized = true;
-#endif
- hugetlb_register_all_nodes();
-}
-
-#ifdef CONFIG_SYSCTL
-static void hugetlb_sysctl_init(void);
-#else
-static inline void hugetlb_sysctl_init(void) { }
-#endif
-
static int __init hugetlb_init(void)
{
int i;
@@ -5092,131 +4542,6 @@ static unsigned int allowed_mems_nr(struct hstate *h)
return nr;
}
-#ifdef CONFIG_SYSCTL
-static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
- void *buffer, size_t *length,
- loff_t *ppos, unsigned long *out)
-{
- struct ctl_table dup_table;
-
- /*
- * In order to avoid races with __do_proc_doulongvec_minmax(), we
- * can duplicate the @table and alter the duplicate of it.
- */
- dup_table = *table;
- dup_table.data = out;
-
- return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
-}
-
-static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
- const struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
-{
- struct hstate *h = &default_hstate;
- unsigned long tmp = h->max_huge_pages;
- int ret;
-
- if (!hugepages_supported())
- return -EOPNOTSUPP;
-
- ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
- &tmp);
- if (ret)
- goto out;
-
- if (write)
- ret = __nr_hugepages_store_common(obey_mempolicy, h,
- NUMA_NO_NODE, tmp, *length);
-out:
- return ret;
-}
-
-static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
-{
-
- return hugetlb_sysctl_handler_common(false, table, write,
- buffer, length, ppos);
-}
-
-#ifdef CONFIG_NUMA
-static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
-{
- return hugetlb_sysctl_handler_common(true, table, write,
- buffer, length, ppos);
-}
-#endif /* CONFIG_NUMA */
-
-static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
-{
- struct hstate *h = &default_hstate;
- unsigned long tmp;
- int ret;
-
- if (!hugepages_supported())
- return -EOPNOTSUPP;
-
- tmp = h->nr_overcommit_huge_pages;
-
- if (write && hstate_is_gigantic(h))
- return -EINVAL;
-
- ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
- &tmp);
- if (ret)
- goto out;
-
- if (write) {
- spin_lock_irq(&hugetlb_lock);
- h->nr_overcommit_huge_pages = tmp;
- spin_unlock_irq(&hugetlb_lock);
- }
-out:
- return ret;
-}
-
-static const struct ctl_table hugetlb_table[] = {
- {
- .procname = "nr_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = hugetlb_sysctl_handler,
- },
-#ifdef CONFIG_NUMA
- {
- .procname = "nr_hugepages_mempolicy",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = &hugetlb_mempolicy_sysctl_handler,
- },
-#endif
- {
- .procname = "hugetlb_shm_group",
- .data = &sysctl_hugetlb_shm_group,
- .maxlen = sizeof(gid_t),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nr_overcommit_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = hugetlb_overcommit_handler,
- },
-};
-
-static void __init hugetlb_sysctl_init(void)
-{
- register_sysctl_init("vm", hugetlb_table);
-}
-#endif /* CONFIG_SYSCTL */
-
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h;
@@ -5521,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
set_huge_ptep_writable(vma, address, ptep);
}
-bool is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return false;
- swp = pte_to_swp_entry(pte);
- if (is_migration_entry(swp))
- return true;
- else
- return false;
-}
-
-bool is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return false;
- swp = pte_to_swp_entry(pte);
- if (is_hwpoison_entry(swp))
- return true;
- else
- return false;
-}
-
static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct folio *new_folio, pte_t old, unsigned long sz)
@@ -5575,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long npages = pages_per_huge_page(h);
struct mmu_notifier_range range;
unsigned long last_addr_mask;
+ softleaf_t softleaf;
int ret = 0;
if (cow) {
@@ -5622,26 +4922,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
again:
if (huge_pte_none(entry)) {
- /*
- * Skip if src entry none.
- */
- ;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+ /* Skip if src entry none. */
+ goto next;
+ }
+
+ softleaf = softleaf_from_pte(entry);
+ if (unlikely(softleaf_is_hwpoison(softleaf))) {
if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
- } else if (unlikely(is_hugetlb_entry_migration(entry))) {
- swp_entry_t swp_entry = pte_to_swp_entry(entry);
+ } else if (unlikely(softleaf_is_migration(softleaf))) {
bool uffd_wp = pte_swp_uffd_wp(entry);
- if (!is_readable_migration_entry(swp_entry) && cow) {
+ if (!softleaf_is_migration_read(softleaf) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- swp_entry = make_readable_migration_entry(
- swp_offset(swp_entry));
- entry = swp_entry_to_pte(swp_entry);
+ softleaf = make_readable_migration_entry(
+ swp_offset(softleaf));
+ entry = swp_entry_to_pte(softleaf);
if (userfaultfd_wp(src_vma) && uffd_wp)
entry = pte_swp_mkuffd_wp(entry);
set_huge_pte_at(src, addr, src_pte, entry, sz);
@@ -5649,9 +4949,8 @@ again:
if (!userfaultfd_wp(dst_vma))
entry = huge_pte_clear_uffd_wp(entry);
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
- } else if (unlikely(is_pte_marker(entry))) {
- pte_marker marker = copy_pte_marker(
- pte_to_swp_entry(entry), dst_vma);
+ } else if (unlikely(pte_is_marker(entry))) {
+ const pte_marker marker = copy_pte_marker(softleaf, dst_vma);
if (marker)
set_huge_pte_at(dst, addr, dst_pte,
@@ -5708,9 +5007,7 @@ again:
}
hugetlb_install_folio(dst_vma, dst_pte, addr,
new_folio, src_pte_old, sz);
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- continue;
+ goto next;
}
if (cow) {
@@ -5731,6 +5028,8 @@ again:
set_huge_pte_at(dst, addr, dst_pte, entry, sz);
hugetlb_count_add(npages, dst);
}
+
+next:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
@@ -5767,13 +5066,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
- if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) {
huge_pte_clear(mm, new_addr, dst_pte, sz);
- else {
+ } else {
if (need_clear_uffd_wp) {
if (pte_present(pte))
pte = huge_pte_clear_uffd_wp(pte);
- else if (is_swap_pte(pte))
+ else
pte = pte_swp_clear_uffd_wp(pte);
}
set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
@@ -6007,7 +5306,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
* could defer the flush until now, since by holding i_mmap_rwsem we
- * guaranteed that the last refernece would not be dropped. But we must
+ * guaranteed that the last reference would not be dropped. But we must
* do the flushing before we return, as otherwise i_mmap_rwsem will be
* dropped and the last reference to the shared PMDs page might be
* dropped as well.
@@ -6586,7 +5885,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
*/
- if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
+ if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte)))
new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
@@ -6712,36 +6011,37 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
- if (huge_pte_none_mostly(vmf.orig_pte)) {
- if (is_pte_marker(vmf.orig_pte)) {
- pte_marker marker =
- pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
-
- if (marker & PTE_MARKER_POISONED) {
- ret = VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- goto out_mutex;
- } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
- /* This isn't supported in hugetlb. */
- ret = VM_FAULT_SIGSEGV;
- goto out_mutex;
- }
- }
-
+ if (huge_pte_none(vmf.orig_pte))
/*
- * Other PTE markers should be handled the same way as none PTE.
- *
* hugetlb_no_page will drop vma lock and hugetlb fault
* mutex internally, which make us return immediately.
*/
return hugetlb_no_page(mapping, &vmf);
+
+ if (pte_is_marker(vmf.orig_pte)) {
+ const pte_marker marker =
+ softleaf_to_marker(softleaf_from_pte(vmf.orig_pte));
+
+ if (marker & PTE_MARKER_POISONED) {
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
+ goto out_mutex;
+ } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+ /* This isn't supported in hugetlb. */
+ ret = VM_FAULT_SIGSEGV;
+ goto out_mutex;
+ }
+
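+	/* Any other marker is handled the same way as a none PTE. */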
+ return hugetlb_no_page(mapping, &vmf);
}
ret = 0;
/* Not present, either a migration or a hwpoisoned entry */
- if (!pte_present(vmf.orig_pte)) {
- if (is_hugetlb_entry_migration(vmf.orig_pte)) {
+ if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) {
+ const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte);
+
+ if (softleaf_is_migration(softleaf)) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6752,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
return 0;
- } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
+ }
+ if (softleaf_is_hwpoison(softleaf)) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
+ }
+
goto out_mutex;
}
@@ -6903,6 +6206,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
int ret = -ENOMEM;
struct folio *folio;
bool folio_in_pagecache = false;
+ pte_t dst_ptep;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
ptl = huge_pte_lock(h, dst_mm, dst_pte);
@@ -7042,13 +6346,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
if (folio_test_hwpoison(folio))
goto out_release_unlock;
+ ret = -EEXIST;
+
+ dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
/*
- * We allow to overwrite a pte marker: consider when both MISSING|WP
- * registered, we firstly wr-protect a none pte which has no page cache
- * page backing it, then access the page.
+ * See comment about UFFD marker overwriting in
+ * mfill_atomic_install_pte().
*/
- ret = -EEXIST;
- if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
+ if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
goto out_release_unlock;
if (folio_in_pagecache)
@@ -7134,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
i_mmap_lock_write(vma->vm_file->f_mapping);
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
+ softleaf_t entry;
spinlock_t *ptl;
+
ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
if (!uffd_wp) {
@@ -7166,14 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
continue;
}
pte = huge_ptep_get(mm, address, ptep);
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ if (huge_pte_none(pte)) {
+ if (unlikely(uffd_wp))
+ /* Safe to modify directly (none->non-present). */
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP),
+ psize);
+ goto next;
+ }
+
+ entry = softleaf_from_pte(pte);
+ if (unlikely(softleaf_is_hwpoison(entry))) {
/* Nothing to do. */
- } else if (unlikely(is_hugetlb_entry_migration(pte))) {
- swp_entry_t entry = pte_to_swp_entry(pte);
- struct folio *folio = pfn_swap_entry_folio(entry);
+ } else if (unlikely(softleaf_is_migration(entry))) {
+ struct folio *folio = softleaf_to_folio(entry);
pte_t newpte = pte;
- if (is_writable_migration_entry(entry)) {
+ if (softleaf_is_migration_write(entry)) {
if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
@@ -7190,17 +6506,17 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
newpte = pte_swp_clear_uffd_wp(newpte);
if (!pte_same(pte, newpte))
set_huge_pte_at(mm, address, ptep, newpte, psize);
- } else if (unlikely(is_pte_marker(pte))) {
+ } else if (unlikely(pte_is_marker(pte))) {
/*
* Do nothing on a poison marker; page is
- * corrupted, permissons do not apply. Here
+ * corrupted, permissions do not apply. Here
* pte_marker_uffd_wp()==true implies !poison
* because they're mutual exclusive.
*/
- if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
+ if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
/* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
- } else if (!huge_pte_none(pte)) {
+ } else {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -7213,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
- } else {
- /* None pte */
- if (unlikely(uffd_wp))
- /* Safe to modify directly (none->non-present). */
- set_huge_pte_at(mm, address, ptep,
- make_pte_marker(PTE_MARKER_UFFD_WP),
- psize);
}
- spin_unlock(ptl);
+next:
+ spin_unlock(ptl);
cond_resched();
}
/*
@@ -7259,9 +6569,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
*/
long hugetlb_reserve_pages(struct inode *inode,
- long from, long to,
- struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+ long from, long to,
+ struct vm_area_desc *desc,
+ vm_flags_t vm_flags)
{
long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
@@ -7277,12 +6587,6 @@ long hugetlb_reserve_pages(struct inode *inode,
}
/*
- * vma specific semaphore used for pmd sharing and fault/truncation
- * synchronization
- */
- hugetlb_vma_lock_alloc(vma);
-
- /*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
@@ -7294,9 +6598,9 @@ long hugetlb_reserve_pages(struct inode *inode,
* Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need
* to reserve the full area even if read-only as mprotect() may be
- * called to make the mapping read-write. Assume !vma is a shm mapping
+ * called to make the mapping read-write. Assume !desc is a shm mapping
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ if (!desc || desc->vm_flags & VM_MAYSHARE) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
@@ -7313,8 +6617,8 @@ long hugetlb_reserve_pages(struct inode *inode,
chg = to - from;
- set_vma_resv_map(vma, resv_map);
- set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ set_vma_desc_resv_map(desc, resv_map);
+ set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
}
if (chg < 0)
@@ -7324,7 +6628,7 @@ long hugetlb_reserve_pages(struct inode *inode,
chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
- if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* of the resv_map.
*/
@@ -7358,7 +6662,7 @@ long hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ if (!desc || desc->vm_flags & VM_MAYSHARE) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
@@ -7412,16 +6716,15 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
- hugetlb_vma_lock_free(vma);
- if (!vma || vma->vm_flags & VM_MAYSHARE)
+ if (!desc || desc->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
- if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
kref_put(&resv_map->refs, resv_map_release);
- set_vma_resv_map(vma, NULL);
+ set_vma_desc_resv_map(desc, NULL);
}
return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}
diff --git a/mm/hugetlb_internal.h b/mm/hugetlb_internal.h
new file mode 100644
index 000000000000..1d2f870deccf
--- /dev/null
+++ b/mm/hugetlb_internal.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Internal HugeTLB definitions.
+ * (C) Nadia Yvette Chambers, April 2004
+ */
+
+#ifndef _LINUX_HUGETLB_INTERNAL_H
+#define _LINUX_HUGETLB_INTERNAL_H
+
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+/*
+ * Check if the hstate represents gigantic pages but gigantic page
+ * runtime support is not available. This is a common condition used to
+ * skip operations that cannot be performed on gigantic pages when runtime
+ * support is disabled.
+ */
+static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
+{
+ return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
+}
+
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static inline int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node_in(nid, *nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static inline int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static inline int hstate_next_node_to_alloc(int *next_node,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(*next_node, nodes_allowed);
+ *next_node = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for remove_pool_hugetlb_folio() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static inline int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
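+/* Walk each allowed node once, round-robin, when growing or shrinking the pool. */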
+#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
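+/* Pool management helpers implemented in hugetlb.c, shared with sysfs/sysctl. */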
+extern void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus);
+extern void add_hugetlb_folio(struct hstate *h, struct folio *folio,
+ bool adjust_surplus);
+extern void init_new_hugetlb_folio(struct folio *folio);
+extern void prep_and_add_allocated_folios(struct hstate *h,
+ struct list_head *folio_list);
+extern long demote_pool_huge_page(struct hstate *src,
+ nodemask_t *nodes_allowed,
+ unsigned long nr_to_demote);
+extern ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ struct hstate *h, int nid,
+ unsigned long count, size_t len);
+
+extern void hugetlb_sysfs_init(void) __init;
+
+#ifdef CONFIG_SYSCTL
+extern void hugetlb_sysctl_init(void);
+#else
+static inline void hugetlb_sysctl_init(void) { }
+#endif
+
+#endif /* _LINUX_HUGETLB_INTERNAL_H */
diff --git a/mm/hugetlb_sysctl.c b/mm/hugetlb_sysctl.c
new file mode 100644
index 000000000000..bd3077150542
--- /dev/null
+++ b/mm/hugetlb_sysctl.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HugeTLB sysctl interfaces.
+ * (C) Nadia Yvette Chambers, April 2004
+ */
+
+#include <linux/sysctl.h>
+
+#include "hugetlb_internal.h"
+
+#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * can duplicate the @table and alter the duplicate of it.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
+ const struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp = h->max_huge_pages;
+ int ret;
+
+ if (!hugepages_supported())
+ return -EOPNOTSUPP;
+
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
+ if (ret)
+ goto out;
+
+ if (write)
+ ret = __nr_hugepages_store_common(obey_mempolicy, h,
+ NUMA_NO_NODE, tmp, *length);
+out:
+ return ret;
+}
+
+static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(false, table, write,
+ buffer, length, ppos);
+}
+
+#ifdef CONFIG_NUMA
+static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ return hugetlb_sysctl_handler_common(true, table, write,
+ buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
+
+static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *length, loff_t *ppos)
+{
+ struct hstate *h = &default_hstate;
+ unsigned long tmp;
+ int ret;
+
+ if (!hugepages_supported())
+ return -EOPNOTSUPP;
+
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && hstate_is_gigantic_no_runtime(h))
+ return -EINVAL;
+
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
+ if (ret)
+ goto out;
+
+ if (write) {
+ spin_lock_irq(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = tmp;
+ spin_unlock_irq(&hugetlb_lock);
+ }
+out:
+ return ret;
+}
+
+static const struct ctl_table hugetlb_table[] = {
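+/*
+ * Hugetlb entries under the "vm" sysctl directory; the nr_hugepages* and
+ * nr_overcommit_hugepages handlers operate on default_hstate.
+ */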
+ {
+ .procname = "nr_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_sysctl_handler,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "nr_hugepages_mempolicy",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &hugetlb_mempolicy_sysctl_handler,
+ },
+#endif
+ {
+ .procname = "hugetlb_shm_group",
+ .data = &sysctl_hugetlb_shm_group,
+ .maxlen = sizeof(gid_t),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "nr_overcommit_hugepages",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = hugetlb_overcommit_handler,
+ },
+};
+
+void __init hugetlb_sysctl_init(void)
+{
+ register_sysctl_init("vm", hugetlb_table);
+}
+#endif /* CONFIG_SYSCTL */
diff --git a/mm/hugetlb_sysfs.c b/mm/hugetlb_sysfs.c
new file mode 100644
index 000000000000..79ece91406bf
--- /dev/null
+++ b/mm/hugetlb_sysfs.c
@@ -0,0 +1,502 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * HugeTLB sysfs interfaces.
+ * (C) Nadia Yvette Chambers, April 2004
+ */
+
+#include <linux/swap.h>
+#include <linux/page_owner.h>
+#include <linux/page-isolation.h>
+
+#include "hugetlb_vmemmap.h"
+#include "hugetlb_internal.h"
+
+#define HSTATE_ATTR_RO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
+#define HSTATE_ATTR(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
+
+static struct kobject *hugepages_kobj;
+static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+
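+/*
+ * Map a sysfs kobject back to its hstate. Global hstate kobjects report
+ * NUMA_NO_NODE via @nidp; anything else falls through to the per-node
+ * lookup.
+ */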
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
+{
+ int i;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = NUMA_NO_NODE;
+ return &hstates[i];
+ }
+
+ return kobj_to_node_hstate(kobj, nidp);
+}
+
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long nr_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ nr_huge_pages = h->nr_huge_pages;
+ else
+ nr_huge_pages = h->nr_huge_pages_node[nid];
+
+ return sysfs_emit(buf, "%lu\n", nr_huge_pages);
+}
+
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
+ struct kobject *kobj, const char *buf,
+ size_t len)
+{
+ struct hstate *h;
+ unsigned long count;
+ int nid;
+ int err;
+
+ err = kstrtoul(buf, 10, &count);
+ if (err)
+ return err;
+
+ h = kobj_to_hstate(kobj, &nid);
+ return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
+}
+
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(false, kobj, buf, len);
+}
+HSTATE_ATTR(nr_hugepages);
+
+#ifdef CONFIG_NUMA
+
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return nr_hugepages_show_common(kobj, attr, buf);
+}
+
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ return nr_hugepages_store_common(true, kobj, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
+
+static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
+}
+
+static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long input;
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+
+ if (hstate_is_gigantic_no_runtime(h))
+ return -EINVAL;
+
+ err = kstrtoul(buf, 10, &input);
+ if (err)
+ return err;
+
+ spin_lock_irq(&hugetlb_lock);
+ h->nr_overcommit_huge_pages = input;
+ spin_unlock_irq(&hugetlb_lock);
+
+ return count;
+}
+HSTATE_ATTR(nr_overcommit_hugepages);
+
+static ssize_t free_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long free_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ free_huge_pages = h->free_huge_pages;
+ else
+ free_huge_pages = h->free_huge_pages_node[nid];
+
+ return sysfs_emit(buf, "%lu\n", free_huge_pages);
+}
+HSTATE_ATTR_RO(free_hugepages);
+
+static ssize_t resv_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
+}
+HSTATE_ATTR_RO(resv_hugepages);
+
+static ssize_t surplus_hugepages_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h;
+ unsigned long surplus_huge_pages;
+ int nid;
+
+ h = kobj_to_hstate(kobj, &nid);
+ if (nid == NUMA_NO_NODE)
+ surplus_huge_pages = h->surplus_huge_pages;
+ else
+ surplus_huge_pages = h->surplus_huge_pages_node[nid];
+
+ return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
+}
+HSTATE_ATTR_RO(surplus_hugepages);
+
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ if (nid != NUMA_NO_NODE) {
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ n_mask = &node_states[N_MEMORY];
+ }
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
+
+ while (nr_demote) {
+ long rc;
+
+ /*
+ * Check for available pages to demote each time through the
+ * loop as demote_pool_huge_page will drop hugetlb_lock.
+ */
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (!nr_available)
+ break;
+
+ rc = demote_pool_huge_page(h, n_mask, nr_demote);
+ if (rc < 0) {
+ err = rc;
+ break;
+ }
+
+ nr_demote -= rc;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ if (err)
+ return err;
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
+ unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
+
+ return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hstate *h, *demote_hstate;
+ unsigned long demote_size;
+ unsigned int demote_order;
+
+ demote_size = (unsigned long)memparse(buf, NULL);
+
+ demote_hstate = size_to_hstate(demote_size);
+ if (!demote_hstate)
+ return -EINVAL;
+ demote_order = demote_hstate->order;
+ if (demote_order < HUGETLB_PAGE_ORDER)
+ return -EINVAL;
+
+ /* demote order must be smaller than hstate order */
+ h = kobj_to_hstate(kobj, NULL);
+ if (demote_order >= h->order)
+ return -EINVAL;
+
+ /* resize_lock synchronizes access to demote size and writes */
+ mutex_lock(&h->resize_lock);
+ h->demote_order = demote_order;
+ mutex_unlock(&h->resize_lock);
+
+ return count;
+}
+HSTATE_ATTR(demote_size);
+
+static struct attribute *hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &nr_overcommit_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &resv_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+ &nr_hugepages_mempolicy_attr.attr,
+#endif
+ NULL,
+};
+
+static const struct attribute_group hstate_attr_group = {
+ .attrs = hstate_attrs,
+};
+
+static struct attribute *hstate_demote_attrs[] = {
+ &demote_size_attr.attr,
+ &demote_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+ .attrs = hstate_demote_attrs,
+};
+
+static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
+ struct kobject **hstate_kobjs,
+ const struct attribute_group *hstate_attr_group)
+{
+ int retval;
+ int hi = hstate_index(h);
+
+ hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
+ if (!hstate_kobjs[hi])
+ return -ENOMEM;
+
+ retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
+ if (retval) {
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
+
+ if (h->demote_order) {
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
+ pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
+
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node devices in node_devices[] using a parallel array. The array
+ * index of a node device or _hstate == node id.
+ * This is here to avoid any static dependency of the node device driver, in
+ * the base kernel, on the hugetlb module.
+ */
+struct node_hstate {
+ struct kobject *hugepages_kobj;
+ struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+static struct node_hstate node_hstates[MAX_NUMNODES];
+
+/*
+ * A subset of global hstate attributes for node devices
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+ &nr_hugepages_attr.attr,
+ &free_hugepages_attr.attr,
+ &surplus_hugepages_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group per_node_hstate_attr_group = {
+ .attrs = per_node_hstate_attrs,
+};
+
+/*
+ * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
+ * Returns node id via non-NULL nidp.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++) {
+ struct node_hstate *nhs = &node_hstates[nid];
+ int i;
+ for (i = 0; i < HUGE_MAX_HSTATE; i++)
+ if (nhs->hstate_kobjs[i] == kobj) {
+ if (nidp)
+ *nidp = nid;
+ return &hstates[i];
+ }
+ }
+
+ BUG();
+ return NULL;
+}
+
+/*
+ * Unregister hstate attributes from a single node device.
+ * No-op if no hstate attributes attached.
+ */
+void hugetlb_unregister_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+
+ if (!nhs->hugepages_kobj)
+ return; /* no hstate attributes */
+
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
+ }
+
+ kobject_put(nhs->hugepages_kobj);
+ nhs->hugepages_kobj = NULL;
+}
+
+/*
+ * Register hstate attributes for a single node device.
+ * No-op if attributes already registered.
+ */
+void hugetlb_register_node(struct node *node)
+{
+ struct hstate *h;
+ struct node_hstate *nhs = &node_hstates[node->dev.id];
+ int err;
+
+ if (!hugetlb_sysfs_initialized)
+ return;
+
+ if (nhs->hugepages_kobj)
+ return; /* already allocated */
+
+ nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+ &node->dev.kobj);
+ if (!nhs->hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+ nhs->hstate_kobjs,
+ &per_node_hstate_attr_group);
+ if (err) {
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
+ h->name, node->dev.id);
+ hugetlb_unregister_node(node);
+ break;
+ }
+ }
+}
+
+/*
+ * hugetlb init time: register hstate attributes for all registered node
+ * devices of nodes that have memory. All on-line nodes should have
+ * registered their associated device by this time.
+ */
+static void __init hugetlb_register_all_nodes(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ hugetlb_register_node(node_devices[nid]);
+}
+#else /* !CONFIG_NUMA */
+
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+ BUG();
+ if (nidp)
+ *nidp = -1;
+ return NULL;
+}
+
+static void hugetlb_register_all_nodes(void) { }
+
+#endif
+
+void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
+ }
+
+#ifdef CONFIG_NUMA
+ hugetlb_sysfs_initialized = true;
+#endif
+ hugetlb_register_all_nodes();
+}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index ba0fb1b6a5a8..9d01f883fd71 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -15,7 +15,8 @@
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
+
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"
@@ -75,7 +76,7 @@ static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
if (likely(pmd_leaf(*pmd))) {
/*
* Higher order allocations from buddy allocator must be able to
- * be treated as indepdenent small pages (as they can be freed
+ * be treated as independent small pages (as they can be freed
* individually).
*/
if (!PageReserved(head))
@@ -684,7 +685,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
ret = hugetlb_vmemmap_split_folio(h, folio);
/*
- * Spliting the PMD requires allocating a page, thus lets fail
+ * Splitting the PMD requires allocating a page, thus let's fail
* early once we encounter the first OOM. No point in retrying
* as it can be dynamically done on remap with the memory
* we get back from the vmemmap deduplication.
@@ -715,7 +716,7 @@ static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
/*
* Pages to be freed may have been accumulated. If we
* encounter an ENOMEM, free what we have and try again.
- * This can occur in the case that both spliting fails
+ * This can occur in the case that both splitting fails
* halfway and head page allocation also failed. In this
* case __hugetlb_vmemmap_optimize_folio() would free memory
* allowing more vmemmap remaps to occur.
diff --git a/mm/internal.h b/mm/internal.h
index 27ad37a41868..89790def1bae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,7 +15,7 @@
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>
@@ -325,8 +325,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
/**
* pte_move_swp_offset - Move the swap entry offset field of a swap pte
* forward or backward by delta
- * @pte: The initial pte state; is_swap_pte(pte) must be true and
- * non_swap_entry() must be false.
+ * @pte: The initial pte state; must be a swap entry.
* @delta: The direction and the offset we are moving; forward if delta
* is positive; backward if delta is negative
*
@@ -335,7 +334,7 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte,
*/
static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
- swp_entry_t entry = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
(swp_offset(entry) + delta)));
@@ -352,8 +351,7 @@ static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
/**
* pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
- * @pte: The initial pte state; is_swap_pte(pte) must be true and
- * non_swap_entry() must be false.
+ * @pte: The initial pte state; must be a swap entry.
*
* Increments the swap offset, while maintaining all other fields, including
* swap type, and any swp pte bits. The resulting pte is returned.
@@ -382,21 +380,23 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
pte_t expected_pte = pte_next_swp_offset(pte);
const pte_t *end_ptep = start_ptep + max_nr;
- swp_entry_t entry = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
pte_t *ptep = start_ptep + 1;
unsigned short cgroup_id;
VM_WARN_ON(max_nr < 1);
- VM_WARN_ON(!is_swap_pte(pte));
- VM_WARN_ON(non_swap_entry(entry));
+ VM_WARN_ON(!softleaf_is_swap(entry));
cgroup_id = lookup_swap_cgroup_id(entry);
while (ptep < end_ptep) {
+ softleaf_t entry;
+
pte = ptep_get(ptep);
if (!pte_same(pte, expected_pte))
break;
- if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
+ entry = softleaf_from_pte(pte);
+ if (lookup_swap_cgroup_id(entry) != cgroup_id)
break;
expected_pte = pte_next_swp_offset(expected_pte);
ptep++;
@@ -1355,7 +1355,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
- pgprot_t prot, struct page **pages, unsigned int page_shift);
+ pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask);
unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
static inline void vmalloc_init(void)
@@ -1364,7 +1364,7 @@ static inline void vmalloc_init(void)
static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
- pgprot_t prot, struct page **pages, unsigned int page_shift)
+ pgprot_t prot, struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
{
return -EINVAL;
}
@@ -1378,6 +1378,26 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
void __vunmap_range_noflush(unsigned long start, unsigned long end);
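+/*
+ * Is this a private VMA belonging to an mm with a single user? Note that
+ * mm_users is only a point-in-time check.
+ */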
+static inline bool vma_is_single_threaded_private(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHARED)
+ return false;
+
+ return atomic_read(&vma->vm_mm->mm_users) == 1;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
+ bool is_private_single_threaded);
+
+#else
+static inline bool folio_can_map_prot_numa(struct folio *folio,
+ struct vm_area_struct *vma, bool is_private_single_threaded)
+{
+ return false;
+}
+#endif
+
int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
unsigned long addr, int *flags, bool writable,
int *last_cpupid);
@@ -1534,7 +1554,7 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
* VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
* will be constantly true.
*/
- if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ if (!pgtable_supports_soft_dirty())
return false;
/*
@@ -1629,7 +1649,10 @@ static inline void accept_page(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
/* pagewalk.c */
-int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private);
+int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private);
int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
@@ -1657,4 +1680,26 @@ static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm);
int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm);
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn);
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
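+/*
+ * io_remap_*() variants translate the PFN for I/O mappings; the complete
+ * path additionally forces a decrypted pgprot.
+ */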
+static inline void io_remap_pfn_range_prepare(struct vm_area_desc *desc,
+ unsigned long orig_pfn, unsigned long size)
+{
+ const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+
+ return remap_pfn_range_prepare(desc, pfn);
+}
+
+static inline int io_remap_pfn_range_complete(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orig_pfn, unsigned long size,
+ pgprot_t orig_prot)
+{
+ const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+ const pgprot_t prot = pgprot_decrypted(orig_prot);
+
+ return remap_pfn_range_complete(vma, addr, pfn, size, prot);
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 38e8bb0bf326..1d27f1bd260b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -305,9 +305,6 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init,
static inline bool check_page_allocation(void *ptr, unsigned long ip)
{
- if (!kasan_enabled())
- return false;
-
if (ptr != page_address(virt_to_head_page(ptr))) {
kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
return true;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index b413c46b3e04..2b8e73f5f6a7 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -506,9 +506,6 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta)
static void release_free_meta(const void *object, struct kasan_free_meta *meta)
{
- if (!kasan_enabled())
- return;
-
/* Check if free meta is valid. */
if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META)
return;
@@ -573,7 +570,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
kasan_save_track(&alloc_meta->alloc_track, flags);
}
-void __kasan_save_free_info(struct kmem_cache *cache, void *object)
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
struct kasan_free_meta *free_meta;
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 07fa7375a848..fc9169a54766 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -399,12 +399,7 @@ void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack);
void kasan_save_track(struct kasan_track *track, gfp_t flags);
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
-void __kasan_save_free_info(struct kmem_cache *cache, void *object);
-static inline void kasan_save_free_info(struct kmem_cache *cache, void *object)
-{
- if (kasan_enabled())
- __kasan_save_free_info(cache, object);
-}
+void kasan_save_free_info(struct kmem_cache *cache, void *object);
#ifdef CONFIG_KASAN_GENERIC
bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 5d2a876035d6..29a751a8a08d 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -354,7 +354,7 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask
return 0;
}
-static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask)
+static int __kasan_populate_vmalloc_do(unsigned long start, unsigned long end, gfp_t gfp_mask)
{
unsigned long nr_pages, nr_total = PFN_UP(end - start);
struct vmalloc_populate_data data;
@@ -377,18 +377,10 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_
* page tables allocations ignore external gfp mask, enforce it
* by the scope API
*/
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- flags = memalloc_nofs_save();
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- flags = memalloc_noio_save();
-
+ flags = memalloc_apply_gfp_scope(gfp_mask);
ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
kasan_populate_vmalloc_pte, &data);
-
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- memalloc_nofs_restore(flags);
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- memalloc_noio_restore(flags);
+ memalloc_restore_scope(flags);
___free_pages_bulk(data.pages, nr_pages);
if (ret)
@@ -403,14 +395,11 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_
return ret;
}
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
+int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask)
{
unsigned long shadow_start, shadow_end;
int ret;
- if (!kasan_enabled())
- return 0;
-
if (!is_vmalloc_or_module_addr((void *)addr))
return 0;
@@ -432,7 +421,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask);
+ ret = __kasan_populate_vmalloc_do(shadow_start, shadow_end, gfp_mask);
if (ret)
return ret;
@@ -574,7 +563,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
* pages entirely covered by the free region, we will not run in to any
* trouble - any simultaneous allocations will be for disjoint regions.
*/
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
+void __kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end,
unsigned long flags)
@@ -583,9 +572,6 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long region_start, region_end;
unsigned long size;
- if (!kasan_enabled())
- return;
-
region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
@@ -634,9 +620,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
* with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
*/
- if (!kasan_enabled())
- return (void *)start;
-
if (!is_vmalloc_or_module_addr(start))
return (void *)start;
@@ -659,9 +642,6 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*/
void __kasan_poison_vmalloc(const void *start, unsigned long size)
{
- if (!kasan_enabled())
- return;
-
if (!is_vmalloc_or_module_addr(start))
return;
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index b9f31293622b..d65d48b85f90 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
save_stack_info(cache, object, flags, false);
}
-void __kasan_save_free_info(struct kmem_cache *cache, void *object)
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
save_stack_info(cache, object, 0, true);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index e62b5516bf48..577a1699c553 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -26,6 +26,7 @@
#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
+#include <linux/reboot.h>
#include <linux/sched/clock.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -822,6 +823,25 @@ static struct notifier_block kfence_check_canary_notifier = {
static struct delayed_work kfence_timer;
#ifdef CONFIG_KFENCE_STATIC_KEYS
+static int kfence_reboot_callback(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ /*
+ * Disable kfence to avoid static keys IPI synchronization during
+ * late shutdown/kexec
+ */
+ WRITE_ONCE(kfence_enabled, false);
+ /* Cancel any pending timer work */
+ cancel_delayed_work_sync(&kfence_timer);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kfence_reboot_notifier = {
+ .notifier_call = kfence_reboot_callback,
+ .priority = INT_MAX, /* Run early to stop timers ASAP */
+};
+
/* Wait queue to wake up allocation-gate timer task. */
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
@@ -903,6 +923,10 @@ static void kfence_init_enable(void)
if (kfence_check_on_panic)
atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
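+ /* Quiesce the allocation gate timer early during reboot/kexec. */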
+ register_reboot_notifier(&kfence_reboot_notifier);
+#endif
+
WRITE_ONCE(kfence_enabled, true);
queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index abe54f0043c7..97d1b2824386 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,21 +17,20 @@
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/dax.h>
#include <linux/ksm.h>
+#include <linux/pgalloc.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
- SCAN_PMD_NULL,
- SCAN_PMD_NONE,
+ SCAN_NO_PTE_TABLE,
SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
@@ -67,7 +66,7 @@ enum scan_result {
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
-/* default scan 8*512 pte (or vmas) every 30 second */
+/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
@@ -129,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
-static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
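+/*
+ * Common helper for the *_sleep_millisecs stores: parse the new value,
+ * store it and wake khugepaged so the new interval takes effect.
+ */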
+static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
+ unsigned int *millisecs)
{
unsigned int msecs;
int err;
@@ -140,12 +138,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
if (err)
return -EINVAL;
- khugepaged_scan_sleep_millisecs = msecs;
+ *millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs);
+}
static struct kobj_attribute scan_sleep_millisecs_attr =
__ATTR_RW(scan_sleep_millisecs);
@@ -160,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned int msecs;
- int err;
-
- err = kstrtouint(buf, 10, &msecs);
- if (err)
- return -EINVAL;
-
- khugepaged_alloc_sleep_millisecs = msecs;
- khugepaged_sleep_expire = 0;
- wake_up_interruptible(&khugepaged_wait);
-
- return count;
+ return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs);
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
__ATTR_RW(alloc_sleep_millisecs);
@@ -337,6 +331,13 @@ struct attribute_group khugepaged_attr_group = {
};
#endif /* CONFIG_SYSFS */
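+/* Is the PTE none, or present and mapping the shared zero page? */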
+static bool pte_none_or_zero(pte_t pte)
+{
+ if (pte_none(pte))
+ return true;
+ return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
+}
+
int hugepage_madvise(struct vm_area_struct *vma,
vm_flags_t *vm_flags, int advice)
{
@@ -518,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
if (pte_none(pteval))
continue;
+ VM_WARN_ON_ONCE(!pte_present(pteval));
pfn = pte_pfn(pteval);
if (is_zero_pfn(pfn))
continue;
@@ -548,8 +550,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (pte_none(pteval) || (pte_present(pteval) &&
- is_zero_pfn(pte_pfn(pteval)))) {
+ if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
@@ -690,17 +691,17 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
- if (is_zero_pfn(pte_pfn(pteval))) {
- /*
- * ptl mostly unnecessary.
- */
- spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- spin_unlock(ptl);
- ksm_might_unmap_zero_page(vma->vm_mm, pteval);
- }
+ if (pte_none(pteval))
+ continue;
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ ptep_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ ksm_might_unmap_zero_page(vma->vm_mm, pteval);
} else {
struct page *src_page = pte_page(pteval);
@@ -794,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
unsigned long src_addr = address + i * PAGE_SIZE;
struct page *src_page;
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
clear_user_highpage(page, src_addr);
continue;
}
@@ -932,21 +933,21 @@ static inline int check_pmd_state(pmd_t *pmd)
pmd_t pmde = pmdp_get_lockless(pmd);
if (pmd_none(pmde))
- return SCAN_PMD_NONE;
+ return SCAN_NO_PTE_TABLE;
/*
* The folio may be under migration when khugepaged is trying to
* collapse it. Migration success or failure will eventually end
* up with a present PMD mapping a folio again.
*/
- if (is_pmd_migration_entry(pmde))
+ if (pmd_is_migration_entry(pmde))
return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
if (pmd_bad(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
return SCAN_SUCCEED;
}
@@ -956,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
{
*pmd = mm_find_pmd(mm, address);
if (!*pmd)
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
return check_pmd_state(*pmd);
}
@@ -1011,13 +1012,14 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
}
vmf.orig_pte = ptep_get_lockless(pte);
- if (!is_swap_pte(vmf.orig_pte))
+ if (pte_none(vmf.orig_pte) ||
+ pte_present(vmf.orig_pte))
continue;
vmf.pte = pte;
@@ -1184,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
@@ -1224,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache_pmd(vma, address, pmd);
- deferred_split_folio(folio, false);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, address);
spin_unlock(pmd_ptl);
folio = NULL;
@@ -1274,14 +1269,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (is_swap_pte(pteval)) {
+ if (pte_none_or_zero(pteval)) {
+ ++none_or_zero;
+ if (!userfaultfd_armed(vma) &&
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ goto out_unmap;
+ }
+ }
+ if (!pte_present(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
unmapped <= khugepaged_max_ptes_swap) {
@@ -1301,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
- result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
- goto out_unmap;
- }
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
@@ -1548,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
- case SCAN_PMD_NULL:
- case SCAN_PMD_NONE:
+ case SCAN_NO_PTE_TABLE:
/*
* All pte entries have been removed and pmd cleared.
* Skip all the pte checks and just update the pmd mapping.
@@ -1715,6 +1709,43 @@ drop_folio:
return result;
}
+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
+{
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
+ */
+ if (READ_ONCE(vma->anon_vma))
+ return false;
+
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ return false;
+
+ /*
+ * If the VMA contains guard regions then we can't collapse it.
+ *
+ * This is set atomically on guard marker installation under mmap/VMA
+ * read lock, and here we may not hold any VMA or mmap lock at all.
+ *
+ * This is therefore serialised on the PTE page table lock, which is
+ * obtained on guard region installation after the flag is set, so this
+ * check being performed under this lock excludes races.
+ */
+ if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
+ return false;
+
+ return true;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1729,14 +1760,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl;
bool success = false;
- /*
- * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth removing
- * page tables from, as PMD-mapping is likely to be split later.
- */
- if (READ_ONCE(vma->anon_vma))
- continue;
-
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1748,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (hpage_collapse_test_exit(mm))
continue;
- /*
- * When a vma is registered with uffd-wp, we cannot recycle
- * the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
- */
- if (userfaultfd_wp(vma))
+
+ if (!file_backed_vma_is_retractable(vma))
continue;
/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1782,15 +1799,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
/*
- * Huge page lock is still held, so normally the page table
- * must remain empty; and we have already skipped anon_vma
- * and userfaultfd_wp() vmas. But since the mmap_lock is not
- * held, it is still possible for a racing userfaultfd_ioctl()
- * to have inserted ptes or markers. Now that we hold ptlock,
- * repeating the anon_vma check protects from one category,
- * and repeating the userfaultfd_wp() check from another.
+ * Huge page lock is still held, so normally the page table must
+ * remain empty; and we have already skipped anon_vma and
+ * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+ * it is still possible for a racing userfaultfd_ioctl() or
+ * madvise() to have inserted ptes or markers. Now that we hold
+ * ptlock, repeating the retractable checks protects us from
+ * races against the prior checks.
*/
- if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
+ if (likely(file_backed_vma_is_retractable(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
success = true;
@@ -2178,14 +2195,14 @@ immap_locked:
}
if (is_shmem)
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
else
- __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
if (nr_none) {
- __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
}
/*
@@ -2784,8 +2801,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
- memset(cc->node_load, 0, sizeof(cc->node_load));
- nodes_clear(cc->alloc_nmask);
if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2815,7 +2830,7 @@ handle_result:
mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
- case SCAN_PMD_NULL:
+ case SCAN_NO_PTE_TABLE:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
case SCAN_LACK_REFERENCED_PAGE:
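
Aside: the VMA_MAYBE_GUARD_BIT logic above relies on PTE page table lock ordering rather than the mmap lock. A minimal sketch of that ordering, using the helpers named in the hunks; the surrounding calls are illustrative only, not part of the patch:

	/* Guard installer (madvise), holding only an mmap/VMA read lock: */
	vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);	/* publish before taking ptl */
	spin_lock(ptl);					/* PTE page table lock */
	/* ... install guard PTE markers ... */
	spin_unlock(ptl);

	/* Retract path, re-checking under the same PTE page table lock.
	 * Safe: any installer must set the flag before taking ptl. */
	spin_lock(ptl);
	if (file_backed_vma_is_retractable(vma))	/* re-tests the flag */
		pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
	spin_unlock(ptl);
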
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 35ceaa8adb41..90f427b95a21 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -33,7 +33,7 @@ bool kmsan_enabled __read_mostly;
/*
* Per-CPU KMSAN context to be used in interrupts, where current->kmsan is
- * unavaliable.
+ * unavailable.
*/
DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 55fdea199aaf..e7f554a31bb4 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -215,7 +215,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages,
- unsigned int page_shift)
+ unsigned int page_shift, gfp_t gfp_mask)
{
unsigned long shadow_start, origin_start, shadow_end, origin_end;
struct page **s_pages, **o_pages;
@@ -230,8 +230,8 @@ int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
return 0;
nr = (end - start) / PAGE_SIZE;
- s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
- o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
+ s_pages = kcalloc(nr, sizeof(*s_pages), gfp_mask);
+ o_pages = kcalloc(nr, sizeof(*o_pages), gfp_mask);
if (!s_pages || !o_pages) {
err = -ENOMEM;
goto ret;
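
Aside: the new gfp_mask parameter lets the caller's allocation context reach KMSAN's shadow/origin allocations instead of a hard-coded GFP_KERNEL. A sketch of the intended call shape — the caller below is hypothetical; only kmsan_vmap_pages_range_noflush() and its new signature come from this diff:

	/* Hypothetical caller: pass the same mask used for the data pages so
	 * metadata allocation obeys the context (e.g. GFP_NOWAIT). */
	static int map_with_metadata(unsigned long start, unsigned long end,
				     pgprot_t prot, struct page **pages,
				     unsigned int page_shift, gfp_t gfp_mask)
	{
		int err = kmsan_vmap_pages_range_noflush(start, end, prot, pages,
							 page_shift, gfp_mask);
		if (err)
			return err;
		/* ... proceed to map the real pages ... */
		return 0;
	}
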
diff --git a/mm/ksm.c b/mm/ksm.c
index c4e730409949..cfc182255c7b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -389,7 +389,7 @@ static unsigned long ewma(unsigned long prev, unsigned long curr)
* exponentially weighted moving average. The new pages_to_scan value is
* multiplied with that change factor:
*
- * new_pages_to_scan *= change facor
+ * new_pages_to_scan *= change factor
*
* The new_pages_to_scan value is limited by the cpu min and max values. It
* calculates the cpu percent for the last scan and calculates the new
@@ -607,7 +607,76 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ unsigned long *found_addr = (unsigned long *) walk->private;
+ struct mm_struct *mm = walk->mm;
+ pte_t *start_ptep, *ptep;
+ spinlock_t *ptl;
+ int found = 0;
+
+ if (ksm_test_exit(walk->mm))
+ return 0;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte_t pte = ptep_get(ptep);
+ struct folio *folio = NULL;
+
+ if (pte_present(pte)) {
+ folio = vm_normal_folio(walk->vma, addr, pte);
+ } else if (!pte_none(pte)) {
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (softleaf_is_migration(entry))
+ folio = softleaf_to_folio(entry);
+ }
+ /* return 1 if the page is a normal KSM page or a KSM-placed zero page */
+ found = (folio && folio_test_ksm(folio)) ||
+ (pte_present(pte) && is_ksm_zero_pte(pte));
+ if (found) {
+ *found_addr = addr;
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
+ return found;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
/*
+ * Though it's very tempting to unmerge rmap_items from stable tree rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ *
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
*
@@ -620,31 +689,20 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
* of the process that owns 'vma'. We also do not want to enforce
* protection keys here anyway.
*/
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, bool lock_vma)
{
vm_fault_t ret = 0;
-
- if (lock_vma)
- vma_start_write(vma);
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
- bool ksm_page = false;
- struct folio_walk fw;
- struct folio *folio;
+ int ksm_page;
cond_resched();
- folio = folio_walk_start(&fw, vma, addr,
- FW_MIGRATION | FW_ZEROPAGE);
- if (folio) {
- /* Small folio implies FW_LEVEL_PTE. */
- if (!folio_test_large(folio) &&
- (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
- ksm_page = true;
- folio_walk_end(&fw, vma);
- }
-
- if (!ksm_page)
- return 0;
+ ksm_page = walk_page_range_vma(vma, addr, end, ops, &addr);
+ if (ksm_page <= 0)
+ return ksm_page;
ret = handle_mm_fault(vma, addr,
FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
NULL);
@@ -730,7 +788,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
- break_ksm(vma, addr, false);
+ break_ksm(vma, addr, addr + PAGE_SIZE, false);
mmap_read_unlock(mm);
}
@@ -1025,36 +1083,6 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
}
}
-/*
- * Though it's very tempting to unmerge rmap_items from stable tree rather
- * than check every pte of a given vma, the locking doesn't quite work for
- * that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
- * rmap_items from parent to child at fork time (so as not to waste time
- * if exit comes before the next scan reaches it).
- *
- * Similarly, although we'd like to remove rmap_items (so updating counts
- * and freeing memory) when unmerging an area, it's easier to leave that
- * to the next pass of ksmd - consider, for example, how ksmd might be
- * in cmp_and_merge_page on one of the rmap_items we would be removing.
- */
-static int unmerge_ksm_pages(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, bool lock_vma)
-{
- unsigned long addr;
- int err = 0;
-
- for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
- if (ksm_test_exit(vma->vm_mm))
- break;
- if (signal_pending(current))
- err = -ERESTARTSYS;
- else
- err = break_ksm(vma, addr, lock_vma);
- }
- return err;
-}
-
static inline
struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
@@ -1192,8 +1220,7 @@ static int unmerge_and_remove_all_rmap_items(void)
for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
continue;
- err = unmerge_ksm_pages(vma,
- vma->vm_start, vma->vm_end, false);
+ err = break_ksm(vma, vma->vm_start, vma->vm_end, false);
if (err)
goto error;
}
@@ -2712,8 +2739,14 @@ no_vmas:
spin_unlock(&ksm_mmlist_lock);
mm_slot_free(mm_slot_cache, mm_slot);
+ /*
+ * Only clear MMF_VM_MERGEABLE. We must not clear
+ * MMF_VM_MERGE_ANY: for an MMF_VM_MERGE_ANY process, the
+ * mm_struct may only just have been added to the ksm_mm_slot
+ * list, and the process may not yet have started running or
+ * performed mmap/brk to allocate anonymous VMAs.
+ */
mm_flags_clear(MMF_VM_MERGEABLE, mm);
- mm_flags_clear(MMF_VM_MERGE_ANY, mm);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2814,7 +2847,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
return 0;
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
+ err = break_ksm(vma, vma->vm_start, vma->vm_end, true);
if (err)
return err;
}
@@ -2831,12 +2864,20 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
*
* Returns: @vm_flags possibly updated to mark mergeable.
*/
-vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags)
{
if (mm_flags_test(MMF_VM_MERGE_ANY, mm) &&
- __ksm_should_add_vma(file, vm_flags))
+ __ksm_should_add_vma(file, vm_flags)) {
vm_flags |= VM_MERGEABLE;
+ /*
+ * Generally, the mm flags will already include MMF_VM_MERGEABLE
+ * here. However, in rare cases the flag may have been cleared by
+ * ksmd after a scan cycle found no mergeable VMA.
+ */
+ if (unlikely(!mm_flags_test(MMF_VM_MERGEABLE, mm)))
+ __ksm_enter(mm);
+ }
return vm_flags;
}
@@ -2958,7 +2999,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0; /* just ignore the advice */
if (vma->anon_vma) {
- err = unmerge_ksm_pages(vma, start, end, true);
+ err = break_ksm(vma, start, end, true);
if (err)
return err;
}
@@ -3340,7 +3381,7 @@ static int ksm_memory_callback(struct notifier_block *self,
* Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
* and remove_all_stable_nodes() while memory is going offline:
* it is unsafe for them to touch the stable tree at this time.
- * But unmerge_ksm_pages(), rmap lookups and other entry points
+ * But break_ksm(), rmap lookups and other entry points
* which do not need the ksm_thread_mutex are all safe.
*/
mutex_lock(&ksm_thread_mutex);
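
Aside: the MMF_VM_MERGEABLE/MMF_VM_MERGE_ANY interplay above reads more easily as a timeline; a sketch assuming MMF_VM_MERGE_ANY is set via prctl(PR_SET_MEMORY_MERGE, 1):

	/*
	 * prctl(PR_SET_MEMORY_MERGE, 1)     -> MMF_VM_MERGE_ANY set, ksm entered
	 * ksmd scans, no mergeable VMA left -> clears MMF_VM_MERGEABLE only
	 * later mmap() of anonymous memory  -> ksm_vma_flags() sees MERGE_ANY,
	 *                                      marks the VMA VM_MERGEABLE and
	 *                                      calls __ksm_enter() again since
	 *                                      MMF_VM_MERGEABLE was cleared
	 */
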
diff --git a/mm/madvise.c b/mm/madvise.c
index fb1c86e630b6..b617b1be0f53 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -29,7 +29,7 @@
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
@@ -167,7 +167,7 @@ static int madvise_update_vma(vm_flags_t new_flags,
range->start, range->end, anon_name);
else
vma = vma_modify_flags(&vmi, madv_behavior->prev, vma,
- range->start, range->end, new_flags);
+ range->start, range->end, &new_flags);
if (IS_ERR(vma))
return PTR_ERR(vma);
@@ -195,7 +195,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
for (addr = start; addr < end; addr += PAGE_SIZE) {
pte_t pte;
- swp_entry_t entry;
+ softleaf_t entry;
struct folio *folio;
if (!ptep++) {
@@ -205,10 +205,8 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
}
pte = ptep_get(ptep);
- if (!is_swap_pte(pte))
- continue;
- entry = pte_to_swp_entry(pte);
- if (unlikely(non_swap_entry(entry)))
+ entry = softleaf_from_pte(pte);
+ if (unlikely(!softleaf_is_swap(entry)))
continue;
pte_unmap_unlock(ptep, ptl);
@@ -251,7 +249,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
continue;
entry = radix_to_swp_entry(folio);
/* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(entry))
+ if (!softleaf_is_swap(entry))
continue;
addr = vma->vm_start +
@@ -392,7 +390,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
+ !pmd_is_migration_entry(orig_pmd));
goto huge_unlock;
}
@@ -690,17 +688,16 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
* (page allocation + zeroing).
*/
if (!pte_present(ptent)) {
- swp_entry_t entry;
+ softleaf_t entry = softleaf_from_pte(ptent);
- entry = pte_to_swp_entry(ptent);
- if (!non_swap_entry(entry)) {
+ if (softleaf_is_swap(entry)) {
max_nr = (end - addr) / PAGE_SIZE;
nr = swap_pte_batch(pte, max_nr, ptent);
nr_swap -= nr;
free_swap_and_cache_nr(entry, nr);
clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
- } else if (is_hwpoison_entry(entry) ||
- is_poisoned_swp_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry) ||
+ softleaf_is_poison_marker(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
}
continue;
@@ -1071,8 +1068,9 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
static bool is_guard_pte_marker(pte_t ptent)
{
- return is_swap_pte(ptent) &&
- is_guard_swp_entry(pte_to_swp_entry(ptent));
+ const softleaf_t entry = softleaf_from_pte(ptent);
+
+ return softleaf_is_guard_marker(entry);
}
static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
@@ -1122,18 +1120,17 @@ static int guard_install_set_pte(unsigned long addr, unsigned long next,
return 0;
}
-static const struct mm_walk_ops guard_install_walk_ops = {
- .pud_entry = guard_install_pud_entry,
- .pmd_entry = guard_install_pmd_entry,
- .pte_entry = guard_install_pte_entry,
- .install_pte = guard_install_set_pte,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+ struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
long err;
int i;
@@ -1141,24 +1138,38 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
return -EINVAL;
/*
- * If we install guard markers, then the range is no longer
- * empty from a page table perspective and therefore it's
- * appropriate to have an anon_vma.
+ * Set atomically under read lock. All pertinent readers will need to
+ * acquire an mmap/VMA write lock to read it. All remaining readers may
+ * or may not see the flag set, but we don't care.
+ */
+ vma_flag_set_atomic(vma, VMA_MAYBE_GUARD_BIT);
+
+ /*
+ * If anonymous and we are establishing page tables the VMA ought to
+ * have an anon_vma associated with it.
*
- * This ensures that on fork, we copy page tables correctly.
+ * We will hold an mmap read lock if this is necessary, this is checked
+ * as part of the VMA lock logic.
*/
- err = anon_vma_prepare(vma);
- if (err)
- return err;
+ if (vma_is_anonymous(vma)) {
+ VM_WARN_ON_ONCE(!vma->anon_vma &&
+ madv_behavior->lock_mode != MADVISE_MMAP_READ_LOCK);
+
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+ }
/*
* Optimistically try to install the guard marker pages first. If any
- * non-guard pages are encountered, give up and zap the range before
- * trying again.
+ * non-guard pages or THP huge pages are encountered, give up and zap
+ * the range before trying again.
*
* We try a few times before giving up and releasing back to userland to
- * loop around, releasing locks in the process to avoid contention. This
- * would only happen if there was a great many racing page faults.
+ * loop around, releasing locks in the process to avoid contention.
+ *
+ * This would only happen due to races with e.g. page faults or
+ * khugepaged.
*
* In most cases we should simply install the guard markers immediately
* with no zap or looping.
@@ -1167,8 +1178,13 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
- &guard_install_walk_ops, &nr_pages);
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK)
+ err = walk_page_range_vma_unsafe(madv_behavior->vma,
+ range->start, range->end, &walk_ops,
+ &nr_pages);
+ else
+ err = walk_page_range_mm_unsafe(vma->vm_mm, range->start,
+ range->end, &walk_ops, &nr_pages);
if (err < 0)
return err;
@@ -1189,8 +1205,7 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
}
/*
- * We were unable to install the guard pages due to being raced by page
- * faults. This should not happen ordinarily. We return to userspace and
+ * We were unable to install the guard pages, return to userspace and
* immediately retry, relieving lock contention.
*/
return restart_syscall();
@@ -1234,17 +1249,16 @@ static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
return 0;
}
-static const struct mm_walk_ops guard_remove_walk_ops = {
- .pud_entry = guard_remove_pud_entry,
- .pmd_entry = guard_remove_pmd_entry,
- .pte_entry = guard_remove_pte_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
struct vm_area_struct *vma = madv_behavior->vma;
struct madvise_behavior_range *range = &madv_behavior->range;
+ struct mm_walk_ops walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = get_walk_lock(madv_behavior->lock_mode),
+ };
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
@@ -1254,7 +1268,7 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
return -EINVAL;
return walk_page_range_vma(vma, range->start, range->end,
- &guard_remove_walk_ops, NULL);
+ &walk_ops, NULL);
}
#ifdef CONFIG_64BIT
@@ -1567,6 +1581,47 @@ static bool process_madvise_remote_valid(int behavior)
}
}
+/* Does this operation invoke anon_vma_prepare()? */
+static bool prepares_anon_vma(int behavior)
+{
+ switch (behavior) {
+ case MADV_GUARD_INSTALL:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * We have acquired a VMA read lock, is the VMA valid to be madvise'd under VMA
+ * read lock only now we have a VMA to examine?
+ */
+static bool is_vma_lock_sufficient(struct vm_area_struct *vma,
+ struct madvise_behavior *madv_behavior)
+{
+ /* Must span only a single VMA. */
+ if (madv_behavior->range.end > vma->vm_end)
+ return false;
+ /* Remote processes unsupported. */
+ if (current->mm != vma->vm_mm)
+ return false;
+ /* Userfaultfd unsupported. */
+ if (userfaultfd_armed(vma))
+ return false;
+ /*
+ * anon_vma_prepare() explicitly requires an mmap lock for
+ * serialisation, so we cannot use a VMA lock in this case.
+ *
+ * Note that we might race with anon_vma being set; that only makes
+ * this check overly paranoid, which is safe.
+ */
+ if (vma_is_anonymous(vma) &&
+ prepares_anon_vma(madv_behavior->behavior) && !vma->anon_vma)
+ return false;
+
+ return true;
+}
+
/*
* Try to acquire a VMA read lock if possible.
*
@@ -1588,15 +1643,12 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
if (!vma)
goto take_mmap_read_lock;
- /*
- * Must span only a single VMA; uffd and remote processes are
- * unsupported.
- */
- if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
- userfaultfd_armed(vma)) {
+
+ if (!is_vma_lock_sufficient(vma, madv_behavior)) {
vma_end_read(vma);
goto take_mmap_read_lock;
}
+
madv_behavior->vma = vma;
return true;
@@ -1709,9 +1761,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ return MADVISE_MMAP_READ_LOCK;
case MADV_GUARD_INSTALL:
case MADV_GUARD_REMOVE:
- return MADVISE_MMAP_READ_LOCK;
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
case MADV_FREE:
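
Aside: the per-VMA-lock fast path that is_vma_lock_sufficient() gates reduces to roughly the following shape (condensed from try_vma_read_lock() above; error handling omitted):

	vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
	if (vma && is_vma_lock_sufficient(vma, madv_behavior)) {
		madv_behavior->vma = vma;	/* operate under per-VMA lock */
	} else {
		if (vma)
			vma_end_read(vma);	/* insufficient: drop it */
		mmap_read_lock(mm);		/* heavier, but always valid */
	}
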
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index c193de6cb23a..737c407f4081 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -149,7 +149,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
- pud_t pudval = READ_ONCE(*pud);
+ pud_t pudval = pudp_get(pud);
/* Do not split a huge pud */
if (pud_trans_huge(pudval)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b46356da6c0e..be810c1fbfc3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -81,6 +81,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+EXPORT_SYMBOL(root_mem_cgroup);
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -756,7 +757,7 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
}
/**
- * __mod_lruvec_state - update lruvec memory statistics
+ * mod_lruvec_state - update lruvec memory statistics
* @lruvec: the lruvec
* @idx: the stat item
* @val: delta to add to the counter, can be negative
@@ -765,18 +766,18 @@ static void mod_memcg_lruvec_state(struct lruvec *lruvec,
* function updates the all three counters that are affected by a
* change of state at this level: per-node, per-cgroup, per-lruvec.
*/
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
/* Update node */
- __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+ mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
mod_memcg_lruvec_state(lruvec, idx, val);
}
-void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
+void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
int val)
{
struct mem_cgroup *memcg;
@@ -788,17 +789,17 @@ void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx,
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg) {
rcu_read_unlock();
- __mod_node_page_state(pgdat, idx, val);
+ mod_node_page_state(pgdat, idx, val);
return;
}
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_lruvec_state(lruvec, idx, val);
+ mod_lruvec_state(lruvec, idx, val);
rcu_read_unlock();
}
-EXPORT_SYMBOL(__lruvec_stat_mod_folio);
+EXPORT_SYMBOL(lruvec_stat_mod_folio);
-void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
+void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
@@ -814,10 +815,10 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
* vmstats to keep it correct for the root memcg.
*/
if (!memcg) {
- __mod_node_page_state(pgdat, idx, val);
+ mod_node_page_state(pgdat, idx, val);
} else {
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- __mod_lruvec_state(lruvec, idx, val);
+ mod_lruvec_state(lruvec, idx, val);
}
rcu_read_unlock();
}
@@ -1625,6 +1626,37 @@ unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
return page_counter_read(&memcg->memory);
}
+void __memcg_memory_event(struct mem_cgroup *memcg,
+ enum memcg_memory_event event, bool allow_spinning)
+{
+ bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
+ event == MEMCG_SWAP_FAIL;
+
+ /* For now only MEMCG_MAX can happen with !allow_spinning context. */
+ VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
+
+ atomic_long_inc(&memcg->memory_events_local[event]);
+ if (!swap_event && allow_spinning)
+ cgroup_file_notify(&memcg->events_local_file);
+
+ do {
+ atomic_long_inc(&memcg->memory_events[event]);
+ if (allow_spinning) {
+ if (swap_event)
+ cgroup_file_notify(&memcg->swap_events_file);
+ else
+ cgroup_file_notify(&memcg->events_file);
+ }
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ break;
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ break;
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+}
+EXPORT_SYMBOL_GPL(__memcg_memory_event);
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -3880,6 +3912,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
zswap_memcg_offline_cleanup(memcg);
memcg_offline_kmem(memcg);
+ reparent_deferred_split_queue(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
lru_gen_offline_memcg(memcg);
@@ -4455,6 +4488,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
atomic_long_read(&events[MEMCG_OOM_KILL]));
seq_printf(m, "oom_group_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
+ seq_printf(m, "sock_throttled %lu\n",
+ atomic_long_read(&events[MEMCG_SOCK_THROTTLED]));
}
static int memory_events_show(struct seq_file *m, void *v)
@@ -5435,7 +5470,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
* @size: size of compressed object
*
* This forces the charge after obj_cgroup_may_zswap() allowed
- * compression and storage in zwap for this cgroup to go ahead.
+ * compression and storage in zswap for this cgroup to go ahead.
*/
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
{
@@ -5593,3 +5628,16 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
{
return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
}
+
+void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ if (!memcg)
+ memcg = root_mem_cgroup;
+
+ pr_warn("Memory cgroup min protection %lukB -- low protection %lukB",
+ K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE),
+ K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE));
+}
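
Aside: a sketch of how call sites are expected to use the allow_spinning contract above. That memcg_memory_event() wraps this with allow_spinning=true is an assumption about the header-side counterpart, which this diff does not show:

	/* Ordinary, spinnable context: counters and file notifications. */
	memcg_memory_event(memcg, MEMCG_OOM);

	/* Non-spinnable charge path: per the VM_WARN_ON_ONCE() above, only
	 * MEMCG_MAX is expected here; cgroup_file_notify() is skipped. */
	__memcg_memory_event(memcg, MEMCG_MAX, false);
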
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3edebb0cda30..fbc5a01260c8 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/memory-failure.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
@@ -50,7 +51,7 @@
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/slab.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
@@ -60,9 +61,12 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/sysctl.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/memory-failure.h>
+
#include "swap.h"
#include "internal.h"
-#include "ras/ras_event.h"
static int sysctl_memory_failure_early_kill __read_mostly;
@@ -154,6 +158,10 @@ static const struct ctl_table memory_failure_table[] = {
}
};
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,
@@ -688,10 +696,10 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
if (pte_present(pte)) {
pfn = pte_pfn(pte);
} else {
- swp_entry_t swp = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
- if (is_hwpoison_entry(swp))
- pfn = swp_offset_pfn(swp);
+ if (softleaf_is_hwpoison(entry))
+ pfn = softleaf_to_pfn(entry);
}
if (!pfn || pfn != poisoned_pfn)
@@ -885,6 +893,7 @@ static const char * const action_page_types[] = {
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_ALREADY_POISONED] = "already poisoned page",
+ [MF_MSG_PFN_MAP] = "non struct page pfn",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1277,7 +1286,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
- if (type != MF_MSG_ALREADY_POISONED) {
+ if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) {
num_poisoned_pages_inc(pfn);
update_per_node_mf_stats(pfn, result);
}
@@ -1653,12 +1662,13 @@ static int identify_page_state(unsigned long pfn, struct page *p,
* there is still more to do, hence the page refcount we took earlier
* is still needed.
*/
-static int try_to_split_thp_page(struct page *page, bool release)
+static int try_to_split_thp_page(struct page *page, unsigned int new_order,
+ bool release)
{
int ret;
lock_page(page);
- ret = split_huge_page(page);
+ ret = split_huge_page_to_order(page, new_order);
unlock_page(page);
if (ret && release)
@@ -2140,8 +2150,140 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags,
{
LIST_HEAD(tokill);
+ folio_lock(folio);
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
+ folio_unlock(folio);
+
+ kill_procs(&tokill, true, pfn, flags);
+}
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+ guard(mutex)(&pfn_space_lock);
+
+ if (interval_tree_iter_first(&pfn_space_itree,
+ pfn_space->node.start,
+ pfn_space->node.last))
+ return -EBUSY;
+
+ interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+ guard(mutex)(&pfn_space_lock);
+
+ if (interval_tree_iter_first(&pfn_space_itree,
+ pfn_space->node.start,
+ pfn_space->node.last))
+ interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+static void add_to_kill_pfn(struct task_struct *tsk,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill,
+ unsigned long pfn)
+{
+ struct to_kill *tk;
+
+ tk = kmalloc(sizeof(*tk), GFP_ATOMIC);
+ if (!tk) {
+ pr_info("Unable to kill proc %d\n", tsk->pid);
+ return;
+ }
+
+ /* The pfn serves as the pgoff for mappings not backed by struct page. */
+ tk->addr = vma_address(vma, pfn, 1);
+ tk->size_shift = PAGE_SHIFT;
+
+ if (tk->addr == -EFAULT)
+ pr_info("Unable to find address %lx in %s\n",
+ pfn, tsk->comm);
+
+ get_task_struct(tsk);
+ tk->tsk = tsk;
+ list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Collect processes when the error hit a PFN not backed by struct page.
+ */
+static void collect_procs_pfn(struct address_space *mapping,
+ unsigned long pfn, struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ rcu_read_lock();
+ for_each_process(tsk) {
+ struct task_struct *t;
+
+ t = task_early_kill(tsk, true);
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill_pfn(t, vma, to_kill, pfn);
+ }
+ }
+ rcu_read_unlock();
+ i_mmap_unlock_read(mapping);
+}
+
+/**
+ * memory_failure_pfn - Handle memory failure on a page not backed by
+ * struct page.
+ * @pfn: Page frame number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * Return:
+ * 0 - success,
+ * -EBUSY - Page PFN does not belong to any address space mapping.
+ */
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+ struct interval_tree_node *node;
+ LIST_HEAD(tokill);
+
+ scoped_guard(mutex, &pfn_space_lock) {
+ bool mf_handled = false;
+
+ /*
+ * Modules register with MM the address space mapping of
+ * the device memory they manage. Iterate to identify
+ * exactly which address space has mapped this failing
+ * PFN.
+ */
+ for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+ node = interval_tree_iter_next(node, pfn, pfn)) {
+ struct pfn_address_space *pfn_space =
+ container_of(node, struct pfn_address_space, node);
+
+ collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+
+ mf_handled = true;
+ }
+
+ if (!mf_handled)
+ return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED);
+ }
+
+ /*
+ * Unlike system RAM, there is no way to swap in a different
+ * physical page at a given virtual address, so any userspace
+ * consumption of direct PFN memory necessitates SIGBUS (i.e.
+ * MF_MUST_KILL).
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
kill_procs(&tokill, true, pfn, flags);
+
+ return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
}
/**
@@ -2193,6 +2335,14 @@ int memory_failure(unsigned long pfn, int flags)
if (res == 0)
goto unlock_mutex;
+ if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) {
+ /*
+ * The PFN is not backed by struct page.
+ */
+ res = memory_failure_pfn(pfn, flags);
+ goto unlock_mutex;
+ }
+
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn);
put_ref_page(pfn, flags);
@@ -2274,6 +2424,9 @@ try_again:
folio_unlock(folio);
if (folio_test_large(folio)) {
+ const int new_order = min_order_for_split(folio);
+ int err;
+
/*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
@@ -2288,7 +2441,16 @@ try_again:
* page is a valid handlable page.
*/
folio_set_has_hwpoisoned(folio);
- if (try_to_split_thp_page(p, false) < 0) {
+ err = try_to_split_thp_page(p, new_order, /* release= */ false);
+ /*
+ * If splitting a folio to order-0 fails, kill the process.
+ * Split the folio regardless to minimize unusable pages.
+ * Because the memory failure code cannot handle large
+ * folios, this split is always treated as if it failed.
+ */
+ if (err || new_order) {
+ /* get folio again in case the original one is split */
+ folio = page_folio(p);
res = -EHWPOISON;
kill_procs_now(p, pfn, flags, folio);
put_page(p);
@@ -2615,7 +2777,17 @@ static int soft_offline_in_use_page(struct page *page)
};
if (!huge && folio_test_large(folio)) {
- if (try_to_split_thp_page(page, true)) {
+ const int new_order = min_order_for_split(folio);
+
+ /*
+ * If new_order (target split order) is not 0, do not split the
+ * folio at all to retain the still accessible large folio.
+ * NOTE: if minimizing the number of soft offline pages is
+ * preferred, split it to non-zero new_order like it is done in
+ * memory_failure().
+ */
+ if (new_order || try_to_split_thp_page(page, /* new_order= */ 0,
+ /* release= */ true)) {
pr_info("%#lx: thp split failed\n", pfn);
return -EBUSY;
}
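
Aside: a sketch of the intended driver-side use of the registration API above. The device helper and its parameters are hypothetical; the node.start/node.last/mapping usage follows the hunks:

	static struct pfn_address_space mydev_pfn_space;

	static int mydev_register_mf(unsigned long base_pfn, unsigned long nr_pages,
				     struct address_space *mapping)
	{
		mydev_pfn_space.node.start = base_pfn;
		mydev_pfn_space.node.last = base_pfn + nr_pages - 1;
		mydev_pfn_space.mapping = mapping;
		/* memory_failure() on any PFN in [start, last] will now SIGBUS
		 * the processes mapping it through this address_space. */
		return register_pfn_address_space(&mydev_pfn_space);
	}
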
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0ea5c13f10a2..864811fff409 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -519,7 +519,7 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
* for each device getting added in the same NUMA node
* with this specific memtype, bump the map count. We
* Only take memtype device reference once, so that
- * changing a node memtype can be done by droping the
+ * changing a node memtype can be done by dropping the
* only reference count taken here.
*/
diff --git a/mm/memory.c b/mm/memory.c
index aad432e71251..2a55edc48a65 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,7 +60,7 @@
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
@@ -76,13 +76,13 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
+#include <linux/pgalloc.h>
+#include <linux/uaccess.h>
#include <trace/events/kmem.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
@@ -109,7 +109,7 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
return false;
- return pte_marker_uffd_wp(vmf->orig_pte);
+ return pte_is_uffd_wp_marker(vmf->orig_pte);
}
/*
@@ -902,7 +902,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
static int try_restore_exclusive_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
- struct page *page = pfn_swap_entry_to_page(pte_to_swp_entry(orig_pte));
+ const softleaf_t entry = softleaf_from_pte(orig_pte);
+ struct page *page = softleaf_to_page(entry);
struct folio *folio = page_folio(page);
if (folio_trylock(folio)) {
@@ -927,12 +928,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
{
vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
+ softleaf_t entry = softleaf_from_pte(orig_pte);
pte_t pte = orig_pte;
struct folio *folio;
struct page *page;
- swp_entry_t entry = pte_to_swp_entry(orig_pte);
- if (likely(!non_swap_entry(entry))) {
+ if (likely(softleaf_is_swap(entry))) {
if (swap_duplicate(entry) < 0)
return -EIO;
@@ -950,12 +951,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pte_at(src_mm, addr, src_pte, pte);
}
rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- folio = pfn_swap_entry_folio(entry);
+ } else if (softleaf_is_migration(entry)) {
+ folio = softleaf_to_folio(entry);
rss[mm_counter(folio)]++;
- if (!is_readable_migration_entry(entry) &&
+ if (!softleaf_is_migration_read(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both parent and child
@@ -964,15 +965,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
entry = make_readable_migration_entry(
swp_offset(entry));
- pte = swp_entry_to_pte(entry);
+ pte = softleaf_to_pte(entry);
if (pte_swp_soft_dirty(orig_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(orig_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
- } else if (is_device_private_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ } else if (softleaf_is_device_private(entry)) {
+ page = softleaf_to_page(entry);
folio = page_folio(page);
/*
@@ -996,7 +997,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
- if (is_writable_device_private_entry(entry) &&
+ if (softleaf_is_device_private_write(entry) &&
is_cow_mapping(vm_flags)) {
entry = make_readable_device_private_entry(
swp_offset(entry));
@@ -1005,7 +1006,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
- } else if (is_device_exclusive_entry(entry)) {
+ } else if (softleaf_is_device_exclusive(entry)) {
/*
* Make device exclusive entries present by restoring the
* original entry then copying as for a present pte. Device
@@ -1016,7 +1017,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
return -EBUSY;
return -ENOENT;
- } else if (is_pte_marker_entry(entry)) {
+ } else if (softleaf_is_marker(entry)) {
pte_marker marker = copy_pte_marker(entry, dst_vma);
if (marker)
@@ -1217,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
int rss[NR_MM_COUNTERS];
- swp_entry_t entry = (swp_entry_t){0};
+ softleaf_t entry = softleaf_mk_none();
struct folio *prealloc = NULL;
int nr;
@@ -1281,7 +1282,7 @@ again:
dst_vma, src_vma,
addr, rss);
if (ret == -EIO) {
- entry = pte_to_swp_entry(ptep_get(src_pte));
+ entry = softleaf_from_pte(ptep_get(src_pte));
break;
} else if (ret == -EBUSY) {
break;
@@ -1374,8 +1375,9 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
+ if (pmd_is_huge(*src_pmd)) {
int err;
+
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
addr, dst_vma, src_vma);
@@ -1463,18 +1465,12 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
+ if (src_vma->vm_flags & VM_COPY_ON_FORK)
+ return true;
/*
- * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
- * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
- * contains uffd-wp protection information, that's something we can't
- * retrieve from page cache, and skip copying will lose those info.
+ * The presence of an anon_vma indicates an anonymous VMA has page
+ * tables which naturally cannot be reconstituted on page fault.
*/
- if (userfaultfd_wp(dst_vma))
- return true;
-
- if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
- return true;
-
if (src_vma->anon_vma)
return true;
@@ -1594,7 +1590,9 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
{
bool was_installed = false;
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
+ if (!uffd_supports_wp_marker())
+ return false;
+
/* Zap on anonymous always means dropping everything */
if (vma_is_anonymous(vma))
return false;
@@ -1611,7 +1609,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
pte++;
addr += PAGE_SIZE;
}
-#endif
+
return was_installed;
}
@@ -1717,14 +1715,14 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
unsigned int max_nr, unsigned long addr,
struct zap_details *details, int *rss, bool *any_skipped)
{
- swp_entry_t entry;
+ softleaf_t entry;
int nr = 1;
*any_skipped = true;
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
+ entry = softleaf_from_pte(ptent);
+ if (softleaf_is_device_private(entry) ||
+ softleaf_is_device_exclusive(entry)) {
+ struct page *page = softleaf_to_page(entry);
struct folio *folio = page_folio(page);
if (unlikely(!should_zap_folio(details, folio)))
@@ -1739,7 +1737,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
rss[mm_counter(folio)]--;
folio_remove_rmap_pte(folio, page, vma);
folio_put(folio);
- } else if (!non_swap_entry(entry)) {
+ } else if (softleaf_is_swap(entry)) {
/* Genuine swap entries, hence a private anon pages */
if (!should_zap_cows(details))
return 1;
@@ -1747,20 +1745,20 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
nr = swap_pte_batch(pte, max_nr, ptent);
rss[MM_SWAPENTS] -= nr;
free_swap_and_cache_nr(entry, nr);
- } else if (is_migration_entry(entry)) {
- struct folio *folio = pfn_swap_entry_folio(entry);
+ } else if (softleaf_is_migration(entry)) {
+ struct folio *folio = softleaf_to_folio(entry);
if (!should_zap_folio(details, folio))
return 1;
rss[mm_counter(folio)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
+ } else if (softleaf_is_uffd_wp_marker(entry)) {
/*
* For anon: always drop the marker; for file: only
* drop the marker if explicitly requested.
*/
if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
return 1;
- } else if (is_guard_swp_entry(entry)) {
+ } else if (softleaf_is_guard_marker(entry)) {
/*
* Ordinary zapping should not remove guard PTE
* markers. Only do so if we should remove PTE markers
@@ -1768,7 +1766,8 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
*/
if (!zap_drop_markers(details))
return 1;
- } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry) ||
+ softleaf_is_poison_marker(entry)) {
if (!should_zap_cows(details))
return 1;
} else {
@@ -1921,7 +1920,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
+ if (pmd_is_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
__split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
@@ -2023,8 +2022,7 @@ void unmap_page_range(struct mmu_gather *tlb,
static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr,
- struct zap_details *details, bool mm_wr_locked)
+ unsigned long end_addr, struct zap_details *details)
{
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -2070,7 +2068,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @tree_end: The maximum index to check
- * @mm_wr_locked: lock flag
*
* Unmap all pages in the vma list.
*
@@ -2085,8 +2082,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
*/
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long start_addr,
- unsigned long end_addr, unsigned long tree_end,
- bool mm_wr_locked)
+ unsigned long end_addr, unsigned long tree_end)
{
struct mmu_notifier_range range;
struct zap_details details = {
@@ -2102,8 +2098,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
unsigned long start = start_addr;
unsigned long end = end_addr;
hugetlb_zap_begin(vma, &start, &end);
- unmap_single_vma(tlb, vma, start, end, &details,
- mm_wr_locked);
+ unmap_single_vma(tlb, vma, start, end, &details);
hugetlb_zap_end(vma, &details);
vma = mas_find(mas, tree_end - 1);
} while (vma && likely(!xa_is_zero(vma)));
@@ -2139,7 +2134,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb,
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
if (is_vm_hugetlb_page(vma)) {
/*
@@ -2900,6 +2895,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
+static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
+ unsigned long end, unsigned long vm_start, unsigned long vm_end,
+ unsigned long pfn, pgoff_t *vm_pgoff_p)
+{
+ /*
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
+ */
+ if (is_cow_mapping(vm_flags)) {
+ if (addr != vm_start || end != vm_end)
+ return -EINVAL;
+ *vm_pgoff_p = pfn;
+ }
+
+ return 0;
+}
+
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
@@ -2912,31 +2926,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
- /*
- * Physically remapped pages are special. Tell the
- * rest of the world about it:
- * VM_IO tells people not to look at these pages
- * (accesses can have side effects).
- * VM_PFNMAP tells the core MM that the base pages are just
- * raw PFN mappings, and do not have a "struct page" associated
- * with them.
- * VM_DONTEXPAND
- * Disable vma merging and expanding with mremap().
- * VM_DONTDUMP
- * Omit vma from core dump, even when VM_IO turned off.
- *
- * There's a horrible special case to handle copy-on-write
- * behaviour that some programs depend on. We mark the "original"
- * un-COW'ed pages by matching them up with "vma->vm_pgoff".
- * See vm_normal_page() for details.
- */
- if (is_cow_mapping(vma->vm_flags)) {
- if (addr != vma->vm_start || end != vma->vm_end)
- return -EINVAL;
- vma->vm_pgoff = pfn;
- }
-
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS);
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2957,7 +2947,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
* must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
@@ -3002,23 +2992,9 @@ void pfnmap_track_ctx_release(struct kref *ref)
pfnmap_untrack(ctx->pfn, ctx->size);
kfree(ctx);
}
-#endif /* __HAVE_PFNMAP_TRACKING */
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-#ifdef __HAVE_PFNMAP_TRACKING
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
struct pfnmap_track_ctx *ctx = NULL;
int err;
@@ -3054,15 +3030,78 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
return err;
}
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_track(vma, addr, pfn, size, prot);
+}
#else
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
}
#endif
+
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+ /*
+ * We set addr = VMA start and end = VMA end here, so this cannot
+ * fail; the check is repeated on complete, which fails there if
+ * the specified address range is invalid.
+ */
+ get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
+ desc->start, desc->end, pfn, &desc->pgoff);
+ desc->vm_flags |= VM_REMAP_FLAGS;
+}
+
+static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size)
+{
+ unsigned long end = addr + PAGE_ALIGN(size);
+ int err;
+
+ err = get_remap_pgoff(vma->vm_flags, addr, end,
+ vma->vm_start, vma->vm_end,
+ pfn, &vma->vm_pgoff);
+ if (err)
+ return err;
+
+ vm_flags_set(vma, VM_REMAP_FLAGS);
+ return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ int err;
+
+ err = remap_pfn_range_prepare_vma(vma, addr, pfn, size);
+ if (err)
+ return err;
+
+ return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
EXPORT_SYMBOL(remap_pfn_range);
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return do_remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
/**
* vm_iomap_memory - remap memory to userspace
* @vma: user vma to map to
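
Aside: the prepare/complete split above separates staging the VMA (flags and CoW pgoff fixup on a vm_area_desc, before the VMA exists) from populating the PTEs. A sketch of the expected split usage; the two driver helpers are hypothetical:

	/* At prepare time, on the descriptor (no VMA exists yet): */
	static void mydrv_prepare(struct vm_area_desc *desc, unsigned long pfn)
	{
		remap_pfn_range_prepare(desc, pfn);	/* VM_REMAP_FLAGS + pgoff */
	}

	/* Later, once the VMA is in place, install the PTEs: */
	static int mydrv_complete(struct vm_area_struct *vma, unsigned long pfn)
	{
		return remap_pfn_range_complete(vma, vma->vm_start, pfn,
						vma->vm_end - vma->vm_start,
						vma->vm_page_prot);
	}
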
@@ -4328,7 +4367,7 @@ static inline bool should_try_to_free_swap(struct folio *folio,
* If we want to map a page that's in the swapcache writable, we
* have to detect via the refcount if we're really the exclusive
* user. Try freeing the swapcache to get rid of the swapcache
- * reference only in case it's likely that we'll be the exlusive user.
+ * reference only in case it's likely that we'll be the exclusive user.
*/
return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
folio_ref_count(folio) == (1 + folio_nr_pages(folio));
@@ -4346,7 +4385,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
*
* This should also cover the case where e.g. the pte changed
* quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
- * So is_pte_marker() check is not enough to safely drop the pte.
+ * So pte_is_marker() check is not enough to safely drop the pte.
*/
if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
@@ -4380,8 +4419,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
- swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
- unsigned long marker = pte_marker_get(entry);
+ const softleaf_t entry = softleaf_from_pte(vmf->orig_pte);
+ const pte_marker marker = softleaf_to_marker(entry);
/*
* PTE markers should never be empty. If anything weird happened,
@@ -4398,7 +4437,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (marker & PTE_MARKER_GUARD)
return VM_FAULT_SIGSEGV;
- if (pte_marker_entry_uffd_wp(entry))
+ if (softleaf_is_uffd_wp_marker(entry))
return pte_marker_handle_uffd_wp(vmf);
/* This is an unknown pte marker */
@@ -4409,13 +4448,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *folio;
- swp_entry_t entry;
+ softleaf_t entry;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
if (!folio)
return NULL;
- entry = pte_to_swp_entry(vmf->orig_pte);
+ entry = softleaf_from_pte(vmf->orig_pte);
if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
GFP_KERNEL, entry)) {
folio_put(folio);
@@ -4433,7 +4472,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
unsigned long addr;
- swp_entry_t entry;
+ softleaf_t entry;
int idx;
pte_t pte;
@@ -4443,7 +4482,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
return false;
- entry = pte_to_swp_entry(pte);
+ entry = softleaf_from_pte(pte);
if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
return false;
@@ -4489,7 +4528,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
unsigned long orders;
struct folio *folio;
unsigned long addr;
- swp_entry_t entry;
+ softleaf_t entry;
spinlock_t *ptl;
pte_t *pte;
gfp_t gfp;
@@ -4510,7 +4549,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
if (!zswap_never_enabled())
goto fallback;
- entry = pte_to_swp_entry(vmf->orig_pte);
+ entry = softleaf_from_pte(vmf->orig_pte);
/*
* Get a list of all the (large) orders below PMD_ORDER that are enabled
* and suitable for swapping THP.
@@ -4589,7 +4628,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
rmap_t rmap_flags = RMAP_NONE;
bool need_clear_cache = false;
bool exclusive = false;
- swp_entry_t entry;
+ softleaf_t entry;
pte_t pte;
vm_fault_t ret = 0;
void *shadow = NULL;
@@ -4601,15 +4640,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf))
goto out;
- entry = pte_to_swp_entry(vmf->orig_pte);
- if (unlikely(non_swap_entry(entry))) {
- if (is_migration_entry(entry)) {
+ entry = softleaf_from_pte(vmf->orig_pte);
+ if (unlikely(!softleaf_is_swap(entry))) {
+ if (softleaf_is_migration(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
- } else if (is_device_exclusive_entry(entry)) {
- vmf->page = pfn_swap_entry_to_page(entry);
+ } else if (softleaf_is_device_exclusive(entry)) {
+ vmf->page = softleaf_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
- } else if (is_device_private_entry(entry)) {
+ } else if (softleaf_is_device_private(entry)) {
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
/*
* migrate_to_ram is not yet ready to operate
@@ -4620,7 +4659,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
- vmf->page = pfn_swap_entry_to_page(entry);
+ vmf->page = softleaf_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (unlikely(!vmf->pte ||
@@ -4644,9 +4683,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
- } else if (is_hwpoison_entry(entry)) {
+ } else if (softleaf_is_hwpoison(entry)) {
ret = VM_FAULT_HWPOISON;
- } else if (is_pte_marker_entry(entry)) {
+ } else if (softleaf_is_marker(entry)) {
ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
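For reviewers, the mechanical conversion applied throughout these hunks, side
by side (every pair below is taken from the diff above):

	pte_to_swp_entry(pte)             ->  softleaf_from_pte(pte)
	non_swap_entry(entry)             ->  !softleaf_is_swap(entry)
	is_migration_entry(entry)         ->  softleaf_is_migration(entry)
	is_device_exclusive_entry(entry)  ->  softleaf_is_device_exclusive(entry)
	is_device_private_entry(entry)    ->  softleaf_is_device_private(entry)
	is_hwpoison_entry(entry)          ->  softleaf_is_hwpoison(entry)
	is_pte_marker_entry(entry)        ->  softleaf_is_marker(entry)
	pfn_swap_entry_to_page(entry)     ->  softleaf_to_page(entry)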
@@ -5405,7 +5444,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
/**
* set_pte_range - Set a range of PTEs to point to pages in a folio.
- * @vmf: Fault decription.
+ * @vmf: Fault description.
* @folio: The folio that contains @page.
* @page: The first page to create a PTE for.
* @nr: The number of PTEs to create.
@@ -6332,37 +6371,43 @@ retry_pud:
if (pmd_none(*vmf.pmd) &&
thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) {
ret = create_huge_pmd(&vmf);
- if (!(ret & VM_FAULT_FALLBACK))
+ if (ret & VM_FAULT_FALLBACK)
+ goto fallback;
+ return ret;
- } else {
- vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+ }
- if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(vmf.orig_pmd));
- if (is_pmd_migration_entry(vmf.orig_pmd))
- pmd_migration_entry_wait(mm, vmf.pmd);
- return 0;
- }
- if (pmd_trans_huge(vmf.orig_pmd)) {
- if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&vmf);
+ vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
+ if (pmd_none(vmf.orig_pmd))
+ goto fallback;
- if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
- !pmd_write(vmf.orig_pmd)) {
- ret = wp_huge_pmd(&vmf);
- if (!(ret & VM_FAULT_FALLBACK))
- return ret;
- } else {
- vmf.ptl = pmd_lock(mm, vmf.pmd);
- if (!huge_pmd_set_accessed(&vmf))
- fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
- spin_unlock(vmf.ptl);
- return 0;
- }
+ if (unlikely(!pmd_present(vmf.orig_pmd))) {
+ if (pmd_is_device_private_entry(vmf.orig_pmd))
+ return do_huge_pmd_device_private(&vmf);
+
+ if (pmd_is_migration_entry(vmf.orig_pmd))
+ pmd_migration_entry_wait(mm, vmf.pmd);
+ return 0;
+ }
+ if (pmd_trans_huge(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
+
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !pmd_write(vmf.orig_pmd)) {
+ ret = wp_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+ } else {
+ vmf.ptl = pmd_lock(mm, vmf.pmd);
+ if (!huge_pmd_set_accessed(&vmf))
+ fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+ spin_unlock(vmf.ptl);
+ return 0;
}
}
+fallback:
return handle_pte_fault(&vmf);
}
@@ -6720,12 +6765,12 @@ retry:
goto out;
p4dp = p4d_offset(pgdp, address);
- p4d = READ_ONCE(*p4dp);
+ p4d = p4dp_get(p4dp);
if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
goto out;
pudp = pud_offset(p4dp, address);
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (pud_none(pud))
goto out;
if (pud_leaf(pud)) {
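The raw dereferences become accessor calls; a two-line sketch of the idiom
(equivalent to READ_ONCE() on most architectures, but the accessor form lets
arch code hook page-table reads):

	p4d_t p4d = p4dp_get(p4dp);	/* was: READ_ONCE(*p4dp) */
	pud_t pud = pudp_get(pudp);	/* was: READ_ONCE(*pudp) */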
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 238a6712738e..a63ec679d861 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1302,7 +1302,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (set_node_online) {
node_set_online(nid);
- ret = register_one_node(nid);
+ ret = register_node(nid);
BUG_ON(ret);
}
out:
@@ -1531,7 +1531,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
goto error_memblock_remove;
if (ret) {
node_set_online(nid);
- ret = register_one_node(nid);
+ ret = register_node(nid);
if (WARN_ON(ret)) {
node_set_offline(nid);
goto error_memblock_remove;
@@ -1585,7 +1585,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
error:
if (new_node) {
node_set_offline(nid);
- unregister_one_node(nid);
+ unregister_node(nid);
}
error_memblock_remove:
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
@@ -2190,7 +2190,7 @@ void try_offline_node(int nid)
* node now.
*/
node_set_offline(nid);
- unregister_one_node(nid);
+ unregister_node(nid);
}
EXPORT_SYMBOL(try_offline_node);
@@ -2316,7 +2316,7 @@ static int try_offline_memory_block(struct memory_block *mem, void *arg)
* by offlining code ... so we don't care about that.
*/
page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
- if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ if (page && page_zonenum(page) == ZONE_MOVABLE)
online_type = MMOP_ONLINE_MOVABLE;
rc = device_offline(&mem->dev);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb83cff7db8c..acb9bf89f619 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
+#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
@@ -99,6 +100,7 @@
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/memory-tiers.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
@@ -108,7 +110,7 @@
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/gcd.h>
#include <asm/tlbflush.h>
@@ -645,7 +647,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
struct folio *folio;
struct queue_pages *qp = walk->private;
- if (unlikely(is_pmd_migration_entry(*pmd))) {
+ if (unlikely(pmd_is_migration_entry(*pmd))) {
qp->nr_failed++;
return;
}
@@ -703,7 +705,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (pte_none(ptent))
continue;
if (!pte_present(ptent)) {
- if (is_migration_entry(pte_to_swp_entry(ptent)))
+ const softleaf_t entry = softleaf_from_pte(ptent);
+
+ if (softleaf_is_migration(entry))
qp->nr_failed++;
continue;
}
@@ -766,16 +770,21 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long flags = qp->flags;
struct folio *folio;
spinlock_t *ptl;
- pte_t entry;
+ pte_t ptep;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
- entry = huge_ptep_get(walk->mm, addr, pte);
- if (!pte_present(entry)) {
- if (unlikely(is_hugetlb_entry_migration(entry)))
- qp->nr_failed++;
+ ptep = huge_ptep_get(walk->mm, addr, pte);
+ if (!pte_present(ptep)) {
+ if (!huge_pte_none(ptep)) {
+ const softleaf_t entry = softleaf_from_pte(ptep);
+
+ if (unlikely(softleaf_is_migration(entry)))
+ qp->nr_failed++;
+ }
+
goto unlock;
}
- folio = pfn_folio(pte_pfn(entry));
+ folio = pfn_folio(pte_pfn(ptep));
if (!queue_folio_required(folio, qp))
goto unlock;
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
@@ -803,6 +812,65 @@ unlock:
}
#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_can_map_prot_numa() - check whether the folio's mapping can be made prot numa
+ * @folio: The folio whose mapping is considered for being made NUMA hintable.
+ * @vma: The VMA that the folio belongs to.
+ * @is_private_single_threaded: Whether this is a single-threaded private VMA.
+ *
+ * This function checks whether the folio actually requires its mapping to be
+ * made one which causes a NUMA hinting fault, as there are cases where that is
+ * simply unnecessary. If prot numa is needed, the folio's access time is also
+ * adjusted for memory tiering.
+ *
+ * Return: True if the mapping of the folio needs to be changed, false otherwise.
+ */
+bool folio_can_map_prot_numa(struct folio *folio, struct vm_area_struct *vma,
+ bool is_private_single_threaded)
+{
+ int nid;
+
+ if (!folio || folio_is_zone_device(folio) || folio_test_ksm(folio))
+ return false;
+
+ /* Also skip shared copy-on-write folios */
+ if (is_cow_mapping(vma->vm_flags) && folio_maybe_mapped_shared(folio))
+ return false;
+
+ /* Folios are pinned and can't be migrated */
+ if (folio_maybe_dma_pinned(folio))
+ return false;
+
+ /*
+ * While migration can move some dirty folios,
+ * it cannot move them all from MIGRATE_ASYNC
+ * context.
+ */
+ if (folio_is_file_lru(folio) && folio_test_dirty(folio))
+ return false;
+
+ /*
+ * Don't mess with PTEs if folio is already on the node
+ * a single-threaded process is running on.
+ */
+ nid = folio_nid(folio);
+ if (is_private_single_threaded && (nid == numa_node_id()))
+ return false;
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(nid))
+ return false;
+
+ if (folio_use_access_time(folio))
+ folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
+
+ return true;
+}
+
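The sole consumer is change_pte_range() in the mm/mprotect.c hunk further
below; a condensed caller sketch:

	if (prot_numa &&
	    !folio_can_map_prot_numa(folio, vma, is_private_single_threaded)) {
		/* determine batch to skip */
		nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte,
						   max_nr_ptes, /* flags = */ 0);
		continue;
	}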
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
diff --git a/mm/memremap.c b/mm/memremap.c
index 46cb1b0b6f72..4c2e0d68eb27 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -289,8 +289,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
- if (!pgmap->ops->page_free) {
- WARN(1, "Missing page_free method\n");
+ if (!pgmap->ops->folio_free) {
+ WARN(1, "Missing folio_free method\n");
return ERR_PTR(-EINVAL);
}
if (!pgmap->owner) {
@@ -299,8 +299,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_COHERENT:
- if (!pgmap->ops->page_free) {
- WARN(1, "Missing page_free method\n");
+ if (!pgmap->ops->folio_free) {
+ WARN(1, "Missing folio_free method\n");
return ERR_PTR(-EINVAL);
}
if (!pgmap->owner) {
@@ -416,20 +416,19 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
void free_zone_device_folio(struct folio *folio)
{
struct dev_pagemap *pgmap = folio->pgmap;
+ unsigned long nr = folio_nr_pages(folio);
+ int i;
if (WARN_ON_ONCE(!pgmap))
return;
mem_cgroup_uncharge(folio);
- /*
- * Note: we don't expect anonymous compound pages yet. Once supported
- * and we could PTE-map them similar to THP, we'd have to clear
- * PG_anon_exclusive on all tail pages.
- */
if (folio_test_anon(folio)) {
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
- __ClearPageAnonExclusive(folio_page(folio, 0));
+ for (i = 0; i < nr; i++)
+ __ClearPageAnonExclusive(folio_page(folio, i));
+ } else {
+ VM_WARN_ON_ONCE(folio_test_large(folio));
}
/*
@@ -454,10 +453,10 @@ void free_zone_device_folio(struct folio *folio)
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_COHERENT:
- if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
break;
- pgmap->ops->page_free(folio_page(folio, 0));
- put_dev_pagemap(pgmap);
+ pgmap->ops->folio_free(folio);
+ percpu_ref_put_many(&folio->pgmap->ref, nr);
break;
case MEMORY_DEVICE_GENERIC:
@@ -473,21 +472,26 @@ void free_zone_device_folio(struct folio *folio)
break;
case MEMORY_DEVICE_PCI_P2PDMA:
- if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->page_free))
+ if (WARN_ON_ONCE(!pgmap->ops || !pgmap->ops->folio_free))
break;
- pgmap->ops->page_free(folio_page(folio, 0));
+ pgmap->ops->folio_free(folio);
break;
}
}
-void zone_device_page_init(struct page *page)
+void zone_device_page_init(struct page *page, unsigned int order)
{
+ VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES);
+
/*
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
*/
- WARN_ON_ONCE(!percpu_ref_tryget_live(&page_pgmap(page)->ref));
+ WARN_ON_ONCE(!percpu_ref_tryget_many(&page_pgmap(page)->ref, 1 << order));
set_page_count(page, 1);
lock_page(page);
+
+ if (order)
+ prep_compound_page(page, order);
}
EXPORT_SYMBOL_GPL(zone_device_page_init);
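Taken together, the callback rename and the new order parameter move drivers
to folio-granular semantics; a hedged driver-side sketch (the mydrv_* names
are hypothetical, the ops members are as introduced by this series):

	static void mydrv_folio_free(struct folio *folio);
	static vm_fault_t mydrv_migrate_to_ram(struct vm_fault *vmf);

	static const struct dev_pagemap_ops mydrv_pgmap_ops = {
		.folio_free	= mydrv_folio_free,	/* was: .page_free */
		.migrate_to_ram	= mydrv_migrate_to_ram,
	};

	static void mydrv_init_thp(struct page *page)
	{
		/* Takes 1 << HPAGE_PMD_ORDER pgmap refs and builds the compound page. */
		zone_device_page_init(page, HPAGE_PMD_ORDER);
	}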
diff --git a/mm/migrate.c b/mm/migrate.c
index c0e9f15be2a2..5169f9717f60 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -16,7 +16,7 @@
#include <linux/migrate.h>
#include <linux/export.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
@@ -307,6 +307,7 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(old_pte), page);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_device_private(folio), folio);
if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
mm_forbids_zeropage(pvmw->vma->vm_mm))
@@ -352,7 +353,7 @@ static bool remove_migration_pte(struct folio *folio,
rmap_t rmap_flags = RMAP_NONE;
pte_t old_pte;
pte_t pte;
- swp_entry_t entry;
+ softleaf_t entry;
struct page *new;
unsigned long idx = 0;
@@ -378,22 +379,22 @@ static bool remove_migration_pte(struct folio *folio,
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- entry = pte_to_swp_entry(old_pte);
- if (!is_migration_entry_young(entry))
+ entry = softleaf_from_pte(old_pte);
+ if (!softleaf_is_migration_young(entry))
pte = pte_mkold(pte);
- if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ if (folio_test_dirty(folio) && softleaf_is_migration_dirty(entry))
pte = pte_mkdirty(pte);
if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);
else
pte = pte_clear_soft_dirty(pte);
- if (is_writable_migration_entry(entry))
+ if (softleaf_is_migration_write(entry))
pte = pte_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(old_pte))
pte = pte_mkuffd_wp(pte);
- if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+ if (folio_test_anon(folio) && !softleaf_is_migration_read(entry))
rmap_flags |= RMAP_EXCLUSIVE;
if (unlikely(is_device_private_page(new))) {
@@ -403,7 +404,7 @@ static bool remove_migration_pte(struct folio *folio,
else
entry = make_readable_device_private_entry(
page_to_pfn(new));
- pte = swp_entry_to_pte(entry);
+ pte = softleaf_to_pte(entry);
if (pte_swp_soft_dirty(old_pte))
pte = pte_swp_mksoft_dirty(pte);
if (pte_swp_uffd_wp(old_pte))
@@ -482,7 +483,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
spinlock_t *ptl;
pte_t *ptep;
pte_t pte;
- swp_entry_t entry;
+ softleaf_t entry;
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!ptep)
@@ -491,11 +492,11 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
pte = ptep_get(ptep);
pte_unmap(ptep);
- if (!is_swap_pte(pte))
+ if (pte_none(pte) || pte_present(pte))
goto out;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
+ entry = softleaf_from_pte(pte);
+ if (!softleaf_is_migration(entry))
goto out;
migration_entry_wait_on_locked(entry, ptl);
@@ -514,16 +515,18 @@ out:
void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
+ softleaf_t entry;
pte_t pte;
hugetlb_vma_assert_locked(vma);
spin_lock(ptl);
pte = huge_ptep_get(vma->vm_mm, addr, ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte))) {
- spin_unlock(ptl);
- hugetlb_vma_unlock_read(vma);
- } else {
+ if (huge_pte_none(pte))
+ goto fail;
+
+ entry = softleaf_from_pte(pte);
+ if (softleaf_is_migration(entry)) {
/*
* If migration entry existed, safe to release vma lock
* here because the pgtable page won't be freed without the
@@ -531,8 +534,13 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
* lock release in migration_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
- migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
+ migration_entry_wait_on_locked(entry, ptl);
+ return;
}
+
+fail:
+ spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
#endif
@@ -542,9 +550,9 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
- if (!is_pmd_migration_entry(*pmd))
+ if (!pmd_is_migration_entry(*pmd))
goto unlock;
- migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
+ migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);
@@ -562,7 +570,7 @@ unlock:
static int __folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int expected_count)
{
- XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+ XA_STATE(xas, &mapping->i_pages, folio->index);
struct swap_cluster_info *ci = NULL;
struct zone *oldzone, *newzone;
int dirty;
@@ -667,27 +675,27 @@ static int __folio_migrate_mapping(struct address_space *mapping,
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
- __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
- __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
+ mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
- __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
- __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+ mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
if (folio_test_pmd_mappable(folio)) {
- __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
- __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+ mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+ mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
}
}
#ifdef CONFIG_SWAP
if (folio_test_swapcache(folio)) {
- __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
- __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+ mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+ mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
}
#endif
if (dirty && mapping_can_writeback(mapping)) {
- __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
- __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
@@ -715,7 +723,7 @@ EXPORT_SYMBOL(folio_migrate_mapping);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src)
{
- XA_STATE(xas, &mapping->i_pages, folio_index(src));
+ XA_STATE(xas, &mapping->i_pages, src->index);
int rc, expected_count = folio_expected_ref_count(src) + 1;
if (folio_ref_count(src) != expected_count)
@@ -2164,7 +2172,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
gfp_t gfp_mask;
unsigned int order = 0;
int nid;
- int zidx;
+ enum zone_type zidx;
mtc = (struct migration_target_control *)private;
gfp_mask = mtc->gfp_mask;
@@ -2190,7 +2198,7 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
gfp_mask |= GFP_TRANSHUGE;
order = folio_order(src);
}
- zidx = zone_idx(folio_zone(src));
+ zidx = folio_zonenum(src);
if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..23379663b1e1 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -13,7 +13,8 @@
#include <linux/oom.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
+#include <linux/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
if (!vma_is_anonymous(walk->vma))
return migrate_vma_collect_skip(start, end, walk);
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+
+ /*
+ * Collect the remaining entries as holes, in case we
+ * need to split later
+ */
+ return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -54,70 +72,214 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Return: 0 on success, a negative error code on failure.
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+ struct page *fault_page)
+{
+ int ret;
+ struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+ struct folio *new_fault_folio = NULL;
+
+ if (folio != fault_folio) {
+ folio_get(folio);
+ folio_lock(folio);
+ }
+
+ ret = split_folio(folio);
+ if (ret) {
+ if (folio != fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return ret;
+ }
+
+ new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ if (new_fault_folio != fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ }
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
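The helper's invariant, since split_folio() can relocate @fault_page into a
new folio: on return, the folio containing @fault_page (if any) is the one
left locked, and the extra reference taken here is dropped. A condensed usage
sketch, per the PTE-walk caller below:

	ret = migrate_vma_split_folio(folio, migrate->fault_page);
	if (ret)
		return migrate_vma_collect_skip(addr, end, walk);
	goto again;	/* re-walk: the PMD/PTEs changed under us */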
+/**
+ * migrate_vma_collect_huge_pmd() - collect THP pages without splitting the
+ * folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ * @fault_folio: folio associated with the fault if any
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD granularity.
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+ unsigned long end, struct mm_walk *walk,
+ struct folio *fault_folio)
{
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
struct migrate_vma *migrate = walk->private;
- struct folio *fault_folio = migrate->fault_page ?
- page_folio(migrate->fault_page) : NULL;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
- pte_t *ptep;
+ int ret;
+ unsigned long write = 0;
-again:
- if (pmd_none(*pmdp))
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
return migrate_vma_collect_hole(start, end, -1, walk);
+ }
if (pmd_trans_huge(*pmdp)) {
- struct folio *folio;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
spin_unlock(ptl);
- goto again;
+ return migrate_vma_collect_skip(start, end, walk);
}
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- } else {
- int ret;
+ return migrate_vma_collect_hole(start, end, -1, walk);
+ }
+ if (pmd_write(*pmdp))
+ write = MIGRATE_PFN_WRITE;
+ } else if (!pmd_present(*pmdp)) {
+ const softleaf_t entry = softleaf_from_pmd(*pmdp);
+
+ folio = softleaf_to_folio(entry);
- folio_get(folio);
+ if (!softleaf_is_device_private(entry) ||
+ !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ (folio->pgmap->owner != migrate->pgmap_owner)) {
spin_unlock(ptl);
- /* FIXME: we don't expect THP for fault_folio */
- if (WARN_ON_ONCE(fault_folio == folio))
- return migrate_vma_collect_skip(start, end,
- walk);
- if (unlikely(!folio_trylock(folio)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_folio(folio);
- if (fault_folio != folio)
- folio_unlock(folio);
- folio_put(folio);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (softleaf_is_migration(entry)) {
+ migration_entry_wait_on_locked(entry, ptl);
+ spin_unlock(ptl);
+ return -EAGAIN;
}
+
+ if (softleaf_is_device_private_write(entry))
+ write = MIGRATE_PFN_WRITE;
+ } else {
+ spin_unlock(ptl);
+ return -EAGAIN;
}
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ folio_get(folio);
+ if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+ migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+ | MIGRATE_PFN_MIGRATE
+ | MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages++] = 0;
+ migrate->cpages++;
+ ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (ret) {
+ migrate->npages--;
+ migrate->cpages--;
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ goto fallback;
+ }
+ migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+fallback:
+ spin_unlock(ptl);
+ if (!folio_test_large(folio))
+ goto done;
+ ret = split_folio(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
+ folio_put(folio);
+ if (ret)
+ return migrate_vma_collect_skip(start, end, walk);
+ if (pmd_none(pmdp_get_lockless(pmdp)))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+done:
+ return -ENOENT;
+}
+
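A note on the return contract, since migrate_vma_collect_pmd() below keys off
of it:

	/*
	 * migrate_vma_collect_huge_pmd() returns:
	 *   0       - entry collected at PMD granularity (or recorded as a
	 *             hole); the caller skips the PTE-level walk
	 *   -EAGAIN - raced with a concurrent update; the caller retakes the PMD
	 *   -ENOENT - not collectable as a huge entry (e.g. the folio was
	 *             split); the caller falls through to the PTE-level walk
	 */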
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
+ pte_t *ptep;
+
+again:
+ if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+ int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
+
+ if (ret == -EAGAIN)
+ goto again;
+ if (ret == 0)
+ return 0;
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, start, &ptl);
if (!ptep)
goto again;
arch_enter_lazy_mmu_mode();
+ ptep += (addr - start) / PAGE_SIZE;
for (; addr < end; addr += PAGE_SIZE, ptep++) {
struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
- swp_entry_t entry;
+ softleaf_t entry;
pte_t pte;
pte = ptep_get(ptep);
@@ -136,20 +298,39 @@ again:
* page table entry. Other special swap entries are not
* migratable, and we ignore regular swapped page.
*/
- entry = pte_to_swp_entry(pte);
- if (!is_device_private_entry(entry))
+ entry = softleaf_from_pte(pte);
+ if (!softleaf_is_device_private(entry))
goto next;
- page = pfn_swap_entry_to_page(entry);
+ page = softleaf_to_page(entry);
pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
pgmap->owner != migrate->pgmap_owner)
goto next;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return migrate_vma_collect_skip(addr, end, walk);
+ }
+
+ goto again;
+ }
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
- if (is_writable_device_private_entry(entry))
+ if (softleaf_is_device_private_write(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
pfn = pte_pfn(pte);
@@ -171,12 +352,29 @@ again:
pgmap->owner != migrate->pgmap_owner)
goto next;
}
+ folio = page ? page_folio(page) : NULL;
+ if (folio && folio_test_large(folio)) {
+ int ret;
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return migrate_vma_collect_skip(addr, end, walk);
+ }
+
+ goto again;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
+ if (!page || !page->mapping) {
mpfn = 0;
goto next;
}
@@ -347,14 +545,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
*/
int extra = 1 + (page == fault_page);
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (folio_test_large(folio))
- return false;
-
/* Page from ZONE_DEVICE have one extra reference */
if (folio_is_zone_device(folio))
extra++;
@@ -385,17 +575,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
lru_add_drain();
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
+ unsigned int nr = 1;
if (!page) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
unmapped++;
- continue;
+ goto next;
}
folio = page_folio(page);
+ nr = folio_nr_pages(folio);
+
+ if (nr > 1)
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
/* ZONE_DEVICE folios are not on LRU */
if (!folio_is_zone_device(folio)) {
if (!folio_test_lru(folio) && allow_drain) {
@@ -407,7 +604,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
/* Drop the reference we took in collect */
@@ -426,10 +623,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
unmapped++;
+next:
+ i += nr;
}
for (i = 0; i < npages && restore; i++) {
@@ -575,6 +774,189 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page() - Insert a huge folio into @migrate->vma->vm_mm
+ * at @addr. The folio has already been allocated as part of migrating a
+ * large page.
+ *
+ * @page needs to be initialized and set up after it's allocated. The code here
+ * closely follows __do_huge_pmd_anonymous_page(). This API does not support
+ * THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the folio will be inserted
+ * @page: page to be inserted at @addr
+ * @src: src pfn which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ struct folio *folio = page_folio(page);
+ int ret;
+ vm_fault_t csa_ret;
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t entry;
+ bool flush = false;
+ unsigned long i;
+
+ VM_WARN_ON_FOLIO(!folio, folio);
+ VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+ if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ return ret;
+
+ folio_set_order(folio, HPAGE_PMD_ORDER);
+ folio_set_large_rmappable(folio);
+
+ if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ __folio_mark_uptodate(folio);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ goto abort;
+
+ if (folio_is_device_private(folio)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pmd(swp_entry);
+ } else {
+ if (folio_is_zone_device(folio) &&
+ !folio_is_device_coherent(folio)) {
+ goto abort;
+ }
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ csa_ret = check_stable_address_space(vma->vm_mm);
+ if (csa_ret)
+ goto abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ if (!pmd_none(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto unlock_abort;
+ flush = true;
+ }
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+ if (!folio_is_zone_device(folio))
+ folio_add_lru_vma(folio, vma);
+ folio_get(folio);
+
+ if (flush) {
+ pte_free(vma->vm_mm, pgtable);
+ flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmdp);
+ } else {
+ pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+ update_mmu_cache_pmd(vma, addr, pmdp);
+
+ spin_unlock(ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_abort:
+ spin_unlock(ptl);
+abort:
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ src[i] &= ~MIGRATE_PFN_MIGRATE;
+ return 0;
+}
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{
+ unsigned long i;
+ unsigned long pfn;
+ unsigned long flags;
+ int ret = 0;
+
+ folio_get(folio);
+ split_huge_pmd_address(migrate->vma, addr, true);
+ ret = folio_split_unmapped(folio, 0);
+ if (ret)
+ return ret;
+ migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
+ flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1);
+ pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT;
+ for (i = 1; i < HPAGE_PMD_NR; i++)
+ migrate->src[i+idx] = migrate_pfn(pfn + i) | flags;
+ return ret;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ return 0;
+}
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{
+ return 0;
+}
+#endif
+
+static unsigned long migrate_vma_nr_pages(unsigned long *src)
+{
+ unsigned long nr = 1;
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (*src & MIGRATE_PFN_COMPOUND)
+ nr = HPAGE_PMD_NR;
+#else
+ VM_WARN_ON_ONCE(*src & MIGRATE_PFN_COMPOUND);
+#endif
+ return nr;
+}
+
/*
* This code closely matches the code in:
* __handle_mm_fault()
@@ -585,9 +967,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
- struct page *page,
+ unsigned long *dst,
unsigned long *src)
{
+ struct page *page = migrate_pfn_to_page(*dst);
struct folio *folio = page_folio(page);
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -615,8 +998,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp))
- goto abort;
+
+ if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+ int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+ src, pmdp);
+ if (ret)
+ goto abort;
+ return;
+ }
+
+ if (!pmd_none(*pmdp)) {
+ if (pmd_trans_huge(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto abort;
+ split_huge_pmd(vma, pmdp, addr);
+ } else if (pmd_leaf(*pmdp))
+ goto abort;
+ }
+
if (pte_alloc(mm, pmdp))
goto abort;
if (unlikely(anon_vma_prepare(vma)))
@@ -704,26 +1103,28 @@ static void __migrate_device_pages(unsigned long *src_pfns,
struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- unsigned long i;
+ unsigned long i, j;
bool notified = false;
+ unsigned long addr;
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
struct folio *newfolio, *folio;
int r, extra_cnt = 0;
+ unsigned long nr = 1;
if (!newpage) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* The only time there is no vma is when called from
@@ -741,15 +1142,57 @@ static void __migrate_device_pages(unsigned long *src_pfns,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
- migrate_vma_insert_page(migrate, addr, newpage,
- &src_pfns[i]);
- continue;
+
+ if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+ nr = migrate_vma_nr_pages(&src_pfns[i]);
+ src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+ } else {
+ nr = 1;
+ }
+
+ for (j = 0; j < nr && i + j < npages; j++) {
+ src_pfns[i+j] |= MIGRATE_PFN_MIGRATE;
+ migrate_vma_insert_page(migrate,
+ addr + j * PAGE_SIZE,
+ &dst_pfns[i+j], &src_pfns[i+j]);
+ }
+ goto next;
}
newfolio = page_folio(newpage);
folio = page_folio(page);
mapping = folio_mapping(folio);
+ /*
+ * If THP migration is enabled, check if both src and dst
+ * can migrate large pages
+ */
+ if (thp_migration_supported()) {
+ if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+ if (!migrate) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ nr = 1 << folio_order(folio);
+ addr = migrate->start + i * PAGE_SIZE;
+ if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+ }
+
if (folio_is_device_private(newfolio) ||
folio_is_device_coherent(newfolio)) {
if (mapping) {
@@ -762,7 +1205,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (!folio_test_anon(folio) ||
!folio_free_swap(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
}
} else if (folio_is_zone_device(newfolio)) {
@@ -770,18 +1213,25 @@ static void __migrate_device_pages(unsigned long *src_pfns,
* Other types of ZONE_DEVICE page are not supported.
*/
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
BUG_ON(folio_test_writeback(folio));
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
- r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r)
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- else
- folio_migrate_flags(newfolio, folio);
+ for (j = 0; j < nr && i + j < npages; j++) {
+ folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
+ newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
+
+ r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
+ if (r)
+ src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
+ else
+ folio_migrate_flags(newfolio, folio);
+ }
+next:
+ i += nr;
}
if (notified)
@@ -943,10 +1393,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn)
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
- unsigned long i, pfn;
+ unsigned long i, j, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (pfn = start, i = 0; i < npages; pfn++, i++)
src_pfns[i] = migrate_device_pfn_lock(pfn);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ pfn += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
@@ -964,10 +1427,22 @@ EXPORT_SYMBOL(migrate_device_range);
*/
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
{
- unsigned long i;
+ unsigned long i, j;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = pfn_to_page(src_pfns[i]);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (i = 0; i < npages; i++)
src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
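Both locking loops leave the same array layout for a compound folio; sketched
here for an order-9 folio discovered at index i:

	/*
	 *   src_pfns[i]           = locked head pfn | MIGRATE_PFN_COMPOUND
	 *   src_pfns[i+1..i+511]  = 0  (tail slots, skipped by advancing i)
	 */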
diff --git a/mm/mincore.c b/mm/mincore.c
index 8ec4719370e1..e5d13eea9234 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,7 +14,7 @@
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
@@ -32,11 +32,22 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
spinlock_t *ptl;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+
/*
* Hugepages under user process are always in RAM and never
* swapped out, but theoretically it needs to be checked.
*/
- present = pte && !huge_pte_none_mostly(huge_ptep_get(walk->mm, addr, pte));
+ if (!pte) {
+ present = 0;
+ } else {
+ const pte_t ptep = huge_ptep_get(walk->mm, addr, pte);
+
+ if (huge_pte_none(ptep) || pte_is_marker(ptep))
+ present = 0;
+ else
+ present = 1;
+ }
+
for (; addr != end; vec++, addr += PAGE_SIZE)
*vec = present;
walk->private = vec;
@@ -63,7 +74,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
* absent. Page table may contain migration or hwpoison
* entries which are always uptodate.
*/
- if (non_swap_entry(entry))
+ if (!softleaf_is_swap(entry))
return !shmem;
/*
@@ -175,8 +186,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t pte = ptep_get(ptep);
step = 1;
- /* We need to do cache lookup too for pte markers */
- if (pte_none_mostly(pte))
+ /* We need to do cache lookup too for markers */
+ if (pte_none(pte) || pte_is_marker(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
else if (pte_present(pte)) {
@@ -191,7 +202,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (i = 0; i < step; i++)
vec[i] = 1;
} else { /* pte is a swap entry */
- *vec = mincore_swap(pte_to_swp_entry(pte), false);
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ *vec = mincore_swap(entry, false);
}
vec += step;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index bb0776f5ef7c..2f699c3497a5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -478,7 +478,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
- vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+ vma = vma_modify_flags(vmi, *prev, vma, start, end, &newflags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7712d887b696..c6812b4dbb2e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1909,7 +1909,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
free_area_init_node(nid);
/*
- * No sysfs hierarchy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_node()
 * for memory-less node because here it's not marked as N_MEMORY
 * and won't be set online later. The benefit is that userspace
 * programs won't be confused by sysfs files/directories of
diff --git a/mm/mmap.c b/mm/mmap.c
index 5fd3b80fda1d..4bdb9ffa9e25 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -797,12 +797,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
}
#endif
-unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
- unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags,
- vm_flags_t vm_flags)
+unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
- if (mm_flags_test(MMF_TOPDOWN, mm))
+ if (mm_flags_test(MMF_TOPDOWN, current->mm))
return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
flags, vm_flags);
return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
@@ -848,7 +847,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
addr = thp_get_unmapped_area_vmflags(file, addr, len,
pgoff, flags, vm_flags);
} else {
- addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
+ addr = mm_get_unmapped_area_vmflags(file, addr, len,
pgoff, flags, vm_flags);
}
if (IS_ERR_VALUE(addr))
@@ -864,12 +863,10 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
}
unsigned long
-mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
- unsigned long addr, unsigned long len,
+mm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- return mm_get_unmapped_area_vmflags(mm, file, addr, len,
- pgoff, flags, 0);
+ return mm_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);
@@ -1277,7 +1274,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
+ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX);
mmap_read_unlock(mm);
/*
@@ -1451,8 +1448,10 @@ static struct vm_area_struct *__install_special_mapping(
return ERR_PTR(-ENOMEM);
vma_set_range(vma, addr, addr + len, 0);
- vm_flags_init(vma, (vm_flags | mm->def_flags |
- VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
+ vm_flags |= mm->def_flags | VM_DONTEXPAND;
+ if (pgtable_supports_soft_dirty())
+ vm_flags |= VM_SOFTDIRTY;
+ vm_flags_init(vma, vm_flags & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = ops;
@@ -1750,7 +1749,9 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
for_each_vma(vmi, mpnt) {
struct file *file;
- vma_start_write(mpnt);
+ retval = vma_start_write_killable(mpnt);
+ if (retval < 0)
+ goto loop_out;
if (mpnt->vm_flags & VM_DONTCOPY) {
retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
mpnt->vm_end, GFP_KERNEL);
@@ -1761,14 +1762,6 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
continue;
}
charge = 0;
- /*
- * Don't duplicate many vmas if we've been oom-killed (for
- * example)
- */
- if (fatal_signal_pending(current)) {
- retval = -EINTR;
- goto loop_out;
- }
if (mpnt->vm_flags & VM_ACCOUNT) {
unsigned long len = vma_pages(mpnt);
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 42e3dde73e74..7421b7ea8001 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -45,10 +45,19 @@ EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+/*
+ * __vma_enter_locked() returns 0 immediately if the vma is not
+ * attached, otherwise it waits for any current readers to finish and
+ * returns 1. Returns -EINTR if a signal is received while waiting.
+ */
+static inline int __vma_enter_locked(struct vm_area_struct *vma,
+ bool detaching, int state)
{
+ int err;
unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+ mmap_assert_write_locked(vma->vm_mm);
+
/* Additional refcnt if the vma is attached. */
if (!detaching)
tgt_refcnt++;
@@ -58,15 +67,27 @@ static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching
* vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
*/
if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
- return false;
+ return 0;
rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
- rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- TASK_UNINTERRUPTIBLE);
+ state);
+ if (err) {
+ if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
+ /*
+ * The wait failed, but the last reader went away
+ * as well. Tell the caller the VMA is detached.
+ */
+ WARN_ON_ONCE(!detaching);
+ err = 0;
+ }
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ return err;
+ }
lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
- return true;
+ return 1;
}
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
@@ -75,16 +96,14 @@ static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+ int state)
{
- bool locked;
+ int locked;
- /*
- * __vma_enter_locked() returns false immediately if the vma is not
- * attached, otherwise it waits until refcnt is indicating that vma
- * is attached with no readers.
- */
- locked = __vma_enter_locked(vma, false);
+ locked = __vma_enter_locked(vma, false, state);
+ if (locked < 0)
+ return locked;
/*
* We should use WRITE_ONCE() here because we can have concurrent reads
@@ -100,6 +119,8 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
__vma_exit_locked(vma, &detached);
WARN_ON_ONCE(detached); /* vma should remain attached */
}
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
@@ -118,7 +139,7 @@ void vma_mark_detached(struct vm_area_struct *vma)
*/
if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
/* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true)) {
+ if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
bool detached;
__vma_exit_locked(vma, &detached);
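With the killable variant plumbed through, write-lock waiters can back out on
a fatal signal; a condensed sketch mirroring the dup_mmap() change above:

	retval = vma_start_write_killable(mpnt);
	if (retval < 0)		/* interrupted while waiting for readers */
		goto loop_out;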
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 374aa6f021c6..247e3f9db6c7 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -9,8 +9,8 @@
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>
+#include <linux/pgalloc.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#ifndef CONFIG_MMU_GATHER_NO_GATHER
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 113b48985834..283889e4f1ce 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,9 +29,7 @@
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
-#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
-#include <linux/memory-tiers.h>
#include <uapi/linux/mman.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
@@ -118,62 +116,6 @@ static int mprotect_folio_pte_batch(struct folio *folio, pte_t *ptep,
return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr_ptes, flags);
}
-static bool prot_numa_skip(struct vm_area_struct *vma, unsigned long addr,
- pte_t oldpte, pte_t *pte, int target_node,
- struct folio *folio)
-{
- bool ret = true;
- bool toptier;
- int nid;
-
- /* Avoid TLB flush if possible */
- if (pte_protnone(oldpte))
- goto skip;
-
- if (!folio)
- goto skip;
-
- if (folio_is_zone_device(folio) || folio_test_ksm(folio))
- goto skip;
-
- /* Also skip shared copy-on-write pages */
- if (is_cow_mapping(vma->vm_flags) &&
- (folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio)))
- goto skip;
-
- /*
- * While migration can move some dirty pages,
- * it cannot move them all from MIGRATE_ASYNC
- * context.
- */
- if (folio_is_file_lru(folio) && folio_test_dirty(folio))
- goto skip;
-
- /*
- * Don't mess with PTEs if page is already on the node
- * a single-threaded process is running on.
- */
- nid = folio_nid(folio);
- if (target_node == nid)
- goto skip;
-
- toptier = node_is_toptier(nid);
-
- /*
- * Skip scanning top tier node if normal numa
- * balancing is disabled
- */
- if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier)
- goto skip;
-
- ret = false;
- if (folio_use_access_time(folio))
- folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
-
-skip:
- return ret;
-}
-
/* Set nr_ptes number of ptes, starting from idx */
static void prot_commit_flush_ptes(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes,
@@ -276,7 +218,7 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t *pte, oldpte;
spinlock_t *ptl;
long pages = 0;
- int target_node = NUMA_NO_NODE;
+ bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -287,10 +229,8 @@ static long change_pte_range(struct mmu_gather *tlb,
if (!pte)
return -EAGAIN;
- /* Get target node for single threaded private VMAs */
- if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
- atomic_read(&vma->vm_mm->mm_users) == 1)
- target_node = numa_node_id();
+ if (prot_numa)
+ is_private_single_threaded = vma_is_single_threaded_private(vma);
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
@@ -304,23 +244,26 @@ static long change_pte_range(struct mmu_gather *tlb,
struct page *page;
pte_t ptent;
+ /* Already in the desired state. */
+ if (prot_numa && pte_protnone(oldpte))
+ continue;
+
page = vm_normal_page(vma, addr, oldpte);
if (page)
folio = page_folio(page);
+
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
*/
- if (prot_numa) {
- int ret = prot_numa_skip(vma, addr, oldpte, pte,
- target_node, folio);
- if (ret) {
+ if (prot_numa &&
+ !folio_can_map_prot_numa(folio, vma,
+ is_private_single_threaded)) {
- /* determine batch to skip */
- nr_ptes = mprotect_folio_pte_batch(folio,
- pte, oldpte, max_nr_ptes, /* flags = */ 0);
- continue;
- }
+ /* determine batch to skip */
+ nr_ptes = mprotect_folio_pte_batch(folio,
+ pte, oldpte, max_nr_ptes, /* flags = */ 0);
+ continue;
}
nr_ptes = mprotect_folio_pte_batch(folio, pte, oldpte, max_nr_ptes, flags);
@@ -354,12 +297,31 @@ static long change_pte_range(struct mmu_gather *tlb,
prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
pages += nr_ptes;
- } else if (is_swap_pte(oldpte)) {
- swp_entry_t entry = pte_to_swp_entry(oldpte);
+ } else if (pte_none(oldpte)) {
+ /*
+ * Nobody plays with any none ptes besides
+ * userfaultfd when applying the protections.
+ */
+ if (likely(!uffd_wp))
+ continue;
+
+ if (userfaultfd_wp_use_markers(vma)) {
+ /*
+ * For file-backed mem, we need to be able to
+ * wr-protect a none pte, because even if the
+ * pte is none, the page/swap cache could
+ * exist. We do that by installing a marker.
+ */
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ pages++;
+ }
+ } else {
+ softleaf_t entry = softleaf_from_pte(oldpte);
pte_t newpte;
- if (is_writable_migration_entry(entry)) {
- struct folio *folio = pfn_swap_entry_folio(entry);
+ if (softleaf_is_migration_write(entry)) {
+ const struct folio *folio = softleaf_to_folio(entry);
/*
* A protection check is difficult so
@@ -373,7 +335,7 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- } else if (is_writable_device_private_entry(entry)) {
+ } else if (softleaf_is_device_private_write(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_nonpresent_pte() for explanation.
@@ -383,14 +345,14 @@ static long change_pte_range(struct mmu_gather *tlb,
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_pte_marker_entry(entry)) {
+ } else if (softleaf_is_marker(entry)) {
/*
* Ignore error swap entries unconditionally,
* because any access should sigbus/sigsegv
* anyway.
*/
- if (is_poisoned_swp_entry(entry) ||
- is_guard_swp_entry(entry))
+ if (softleaf_is_poison_marker(entry) ||
+ softleaf_is_guard_marker(entry))
continue;
/*
* If this is uffd-wp pte marker and we'd like
@@ -415,28 +377,6 @@ static long change_pte_range(struct mmu_gather *tlb,
set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
- } else {
- /* It must be an none page, or what else?.. */
- WARN_ON_ONCE(!pte_none(oldpte));
-
- /*
- * Nobody plays with any none ptes besides
- * userfaultfd when applying the protections.
- */
- if (likely(!uffd_wp))
- continue;
-
- if (userfaultfd_wp_use_markers(vma)) {
- /*
- * For file-backed mem, we need to be able to
- * wr-protect a none pte, because even if the
- * pte is none, the page/swap cache could
- * exist. Doing that by install a marker.
- */
- set_pte_at(vma->vm_mm, addr, pte,
- make_pte_marker(PTE_MARKER_UFFD_WP));
- pages++;
- }
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -534,7 +474,7 @@ again:
goto next;
_pmd = pmdp_get_lockless(pmd);
- if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
+ if (pmd_is_huge(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false);
@@ -599,7 +539,7 @@ again:
break;
}
- pud = READ_ONCE(*pudp);
+ pud = pudp_get(pudp);
if (pud_none(pud))
continue;
@@ -813,7 +753,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
newflags &= ~VM_ACCOUNT;
}
- vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags);
+ vma = vma_modify_flags(vmi, *pprev, vma, start, end, &newflags);
if (IS_ERR(vma)) {
error = PTR_ERR(vma);
goto fail;
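
The change_pte_range() rework above hinges on one classification: a PTE is either none, present, or a software leaf ("softleaf") entry, i.e. a migration entry, a device-private entry, or a PTE marker. That is why the old is_swap_pte() catch-all and the trailing WARN_ON_ONCE(!pte_none()) branch both disappear. A minimal userspace sketch of the three-way dispatch, assuming only that the softleaf case covers everything non-present and non-none (the enum and helpers below are illustrative stand-ins, not the kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-ins for pte_t and its predicates. */
    enum pte_kind { PTE_NONE, PTE_PRESENT, PTE_SOFTLEAF };
    struct pte { enum pte_kind kind; };

    static bool pte_none(struct pte p)    { return p.kind == PTE_NONE; }
    static bool pte_present(struct pte p) { return p.kind == PTE_PRESENT; }

    static void change_one_pte(struct pte p)
    {
        if (pte_present(p)) {
            puts("present: rewrite protection bits in place");
        } else if (pte_none(p)) {
            /* Only uffd-wp has business with none ptes. */
            puts("none: maybe install a uffd-wp marker");
        } else {
            /* Migration, device-private, or marker entry. */
            puts("softleaf: rewrite the swap-style entry");
        }
    }

    int main(void)
    {
        struct pte ptes[] = { {PTE_PRESENT}, {PTE_NONE}, {PTE_SOFTLEAF} };
        for (int i = 0; i < 3; i++)
            change_one_pte(ptes[i]);
        return 0;
    }
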
diff --git a/mm/mremap.c b/mm/mremap.c
index 419a0ea0a870..672264807db6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -17,7 +17,7 @@
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -25,10 +25,10 @@
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
+#include <linux/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
@@ -158,16 +158,20 @@ static void drop_rmap_locks(struct vm_area_struct *vma)
static pte_t move_soft_dirty_pte(pte_t pte)
{
+ if (pte_none(pte))
+ return pte;
+
/*
* Set soft dirty bit so we can notice
* in userspace the ptes were moved.
*/
-#ifdef CONFIG_MEM_SOFT_DIRTY
- if (pte_present(pte))
- pte = pte_mksoft_dirty(pte);
- else if (is_swap_pte(pte))
- pte = pte_swp_mksoft_dirty(pte);
-#endif
+ if (pgtable_supports_soft_dirty()) {
+ if (pte_present(pte))
+ pte = pte_mksoft_dirty(pte);
+ else
+ pte = pte_swp_mksoft_dirty(pte);
+ }
+
return pte;
}
@@ -288,13 +292,13 @@ static int move_ptes(struct pagetable_move_control *pmc,
pte = move_pte(pte, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
- if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+ if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
pte_clear(mm, new_addr, new_ptep);
else {
if (need_clear_uffd_wp) {
if (pte_present(pte))
pte = pte_clear_uffd_wp(pte);
- else if (is_swap_pte(pte))
+ else
pte = pte_swp_clear_uffd_wp(pte);
}
set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
@@ -847,7 +851,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
if (!new_pmd)
break;
again:
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+ if (pmd_is_huge(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE &&
move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
continue;
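
move_soft_dirty_pte() above trades the CONFIG_MEM_SOFT_DIRTY #ifdef for a pgtable_supports_soft_dirty() runtime check; when the helper evaluates to a compile-time constant false, the branch is eliminated, but unlike #ifdef both arms stay visible to the compiler. A small standalone sketch of the idiom (the config macro and bit position are made up for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for IS_ENABLED(CONFIG_MEM_SOFT_DIRTY). */
    #define SKETCH_HAS_SOFT_DIRTY 1

    static inline bool supports_soft_dirty(void)
    {
        return SKETCH_HAS_SOFT_DIRTY;
    }

    static unsigned long move_soft_dirty(unsigned long pte)
    {
        /*
         * When supports_soft_dirty() is constant-false this branch is
         * dead-code-eliminated, yet it is still parsed and type-checked,
         * which an #ifdef block would hide from the compiler.
         */
        if (supports_soft_dirty())
            pte |= 1UL << 2;    /* illustrative soft-dirty bit */
        return pte;
    }

    int main(void)
    {
        printf("%#lx\n", move_soft_dirty(0x1000));
        return 0;
    }
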
diff --git a/mm/mseal.c b/mm/mseal.c
index e5b205562d2e..ae442683c5c0 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -66,12 +66,13 @@ static int mseal_apply(struct mm_struct *mm,
prev = vma;
for_each_vma_range(vmi, vma, end) {
- unsigned long curr_end = MIN(vma->vm_end, end);
+ const unsigned long curr_end = MIN(vma->vm_end, end);
if (!(vma->vm_flags & VM_SEALED)) {
- vma = vma_modify_flags(&vmi, prev, vma,
- curr_start, curr_end,
- vma->vm_flags | VM_SEALED);
+ vm_flags_t vm_flags = vma->vm_flags | VM_SEALED;
+
+ vma = vma_modify_flags(&vmi, prev, vma, curr_start,
+ curr_end, &vm_flags);
if (IS_ERR(vma))
return PTR_ERR(vma);
vm_flags_set(vma, VM_SEALED);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c145b0feecc1..5eb11fbba704 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -472,6 +472,7 @@ static void dump_header(struct oom_control *oc)
if (should_dump_unreclaim_slab())
dump_unreclaimable_slab();
}
+ mem_cgroup_show_protected_memory(oc->memcg);
if (sysctl_oom_dump_tasks)
dump_tasks(oc);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a124ab6a205d..ccdeb0e84d39 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2652,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio,
inode_attach_wb(inode, folio);
wb = inode_to_wb(inode);
- __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4074c07d02ca..822e05f1a964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -99,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
/*
* On SMP, spin_trylock is sufficient protection.
* On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ * Pass flags to a no-op inline function to type-check them and silence
+ * unused-variable warnings.
*/
-#define pcp_trylock_prepare(flags) do { } while (0)
-#define pcp_trylock_finish(flag) do { } while (0)
+static inline void __pcp_trylock_noop(unsigned long *flags) { }
+#define pcp_trylock_prepare(flags) __pcp_trylock_noop(&(flags))
+#define pcp_trylock_finish(flags) __pcp_trylock_noop(&(flags))
#else
/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
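
The same trick is what makes the SMP stubs above safe: routing flags through a no-op inline function means the macro argument is still evaluated for type and counted as used, so callers no longer need __maybe_unused. A standalone sketch (names illustrative):

    #include <stdio.h>

    static inline void noop_flags(unsigned long *flags)
    {
        (void)flags;    /* deliberately does nothing */
    }

    #define lock_prepare(flags) noop_flags(&(flags))
    #define lock_finish(flags)  noop_flags(&(flags))

    int main(void)
    {
        unsigned long flags;    /* no __maybe_unused needed */

        lock_prepare(flags);    /* a wrong-typed flags would be diagnosed */
        lock_finish(flags);
        puts("ok");
        return 0;
    }
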
@@ -129,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
 * Generic helper to look up and lock a per-cpu variable with an embedded
 * spinlock. The return value should be used with the matching unlock helper.
*/
-#define pcpu_spin_lock(type, member, ptr) \
-({ \
- type *_ret; \
- pcpu_task_pin(); \
- _ret = this_cpu_ptr(ptr); \
- spin_lock(&_ret->member); \
- _ret; \
-})
-
#define pcpu_spin_trylock(type, member, ptr) \
({ \
type *_ret; \
@@ -157,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
})
/* struct per_cpu_pages specific helpers. */
-#define pcp_spin_lock(ptr) \
- pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
-
-#define pcp_spin_trylock(ptr) \
- pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+#define pcp_spin_trylock(ptr, UP_flags) \
+({ \
+ struct per_cpu_pages *__ret; \
+ pcp_trylock_prepare(UP_flags); \
+ __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
+ if (!__ret) \
+ pcp_trylock_finish(UP_flags); \
+ __ret; \
+})
-#define pcp_spin_unlock(ptr) \
- pcpu_spin_unlock(lock, ptr)
+#define pcp_spin_unlock(ptr, UP_flags) \
+({ \
+ pcpu_spin_unlock(lock, ptr); \
+ pcp_trylock_finish(UP_flags); \
+})
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -2552,10 +2553,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to decay the PCP high.
 * Return whether there is additional work to do.
*/
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
- int high_min, to_drain, batch;
- int todo = 0;
+ int high_min, to_drain, to_drain_batched, batch;
+ bool todo = false;
high_min = READ_ONCE(pcp->high_min);
batch = READ_ONCE(pcp->batch);
@@ -2568,15 +2569,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
pcp->high - (pcp->high >> 3), high_min);
if (pcp->high > high_min)
- todo++;
+ todo = true;
}
to_drain = pcp->count - pcp->high;
- if (to_drain > 0) {
+ while (to_drain > 0) {
+ to_drain_batched = min(to_drain, batch);
spin_lock(&pcp->lock);
- free_pcppages_bulk(zone, to_drain, pcp, 0);
+ free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
spin_unlock(&pcp->lock);
- todo++;
+ todo = true;
+
+ to_drain -= to_drain_batched;
}
return todo;
@@ -2810,12 +2814,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return high;
}
-static void free_frozen_page_commit(struct zone *zone,
+/*
+ * Tune the pcp alloc factor and adjust count & free_count, then free pages
+ * to bring pcp->count back below the high watermark.
+ *
+ * May return with the pcp unlocked if, while freeing, the pcp spinlock
+ * cannot be reacquired. Returns true if the pcp is still locked, false
+ * otherwise.
+ */
+static bool free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order, fpi_t fpi_flags)
+ unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
{
int high, batch;
+ int to_free, to_free_batched;
int pindex;
+ int cpu = smp_processor_id();
+	bool ret = true;
bool free_high = false;
/*
@@ -2853,15 +2867,42 @@ static void free_frozen_page_commit(struct zone *zone,
* Do not attempt to take a zone lock. Let pcp->count get
* over high mark temporarily.
*/
- return;
+ return true;
}
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count < high)
- return;
+ return true;
+
+ to_free = nr_pcp_free(pcp, batch, high, free_high);
+ while (to_free > 0 && pcp->count > 0) {
+ to_free_batched = min(to_free, batch);
+ free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
+ to_free -= to_free_batched;
+
+ if (to_free == 0 || pcp->count == 0)
+ break;
+
+ pcp_spin_unlock(pcp, *UP_flags);
+
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+ if (!pcp) {
+ ret = false;
+ break;
+ }
+
+ /*
+ * Check if this thread has been migrated to a different CPU.
+ * If that is the case, give up and indicate that the pcp is
+ * returned in an unlocked state.
+ */
+ if (smp_processor_id() != cpu) {
+ pcp_spin_unlock(pcp, *UP_flags);
+ ret = false;
+ break;
+ }
+ }
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
zone_watermark_ok(zone, 0, high_wmark_pages(zone),
ZONE_MOVABLE, 0)) {
@@ -2879,6 +2920,7 @@ static void free_frozen_page_commit(struct zone *zone,
next_memory_node(pgdat->node_id) < MAX_NUMNODES)
atomic_set(&pgdat->kswapd_failures, 0);
}
+ return ret;
}
/*
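
The loop above bounds lock hold time by freeing at most one batch per lock acquisition: drop the pcp lock between chunks, re-trylock, and bail out (reporting the lock as dropped) if the trylock fails or the task has meanwhile been migrated off the CPU that owns this pcp. A single-threaded sketch of that control flow, with stubs standing in for the lock and CPU primitives (all names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static bool trylock(void)     { return true; }    /* stub */
    static void unlock(void)      { }                 /* stub */
    static int  current_cpu(void) { return 0; }       /* stub */

    /* Returns true if the lock is still held on exit, false otherwise. */
    static bool drain(int count, int high, int batch)
    {
        int to_free = count - high;
        int cpu = current_cpu();

        while (to_free > 0 && count > 0) {
            int chunk = MIN(to_free, batch);

            printf("free %d pages under the lock\n", chunk);
            count -= chunk;
            to_free -= chunk;
            if (to_free == 0 || count == 0)
                break;

            unlock();                     /* bound the lock hold time */
            if (!trylock())
                return false;             /* caller must not unlock */
            if (current_cpu() != cpu) {   /* migrated: wrong pcp now */
                unlock();
                return false;
            }
        }
        return true;
    }

    int main(void)
    {
        return drain(100, 20, 32) ? 0 : 1;
    }
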
@@ -2887,7 +2929,7 @@ static void free_frozen_page_commit(struct zone *zone,
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
@@ -2923,15 +2965,15 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
add_page_to_zone_llist(zone, page, order);
return;
}
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (pcp) {
- free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
- pcp_spin_unlock(pcp);
+ if (!free_frozen_page_commit(zone, pcp, page, migratetype,
+ order, fpi_flags, &UP_flags))
+ return;
+ pcp_spin_unlock(pcp, UP_flags);
} else {
free_one_page(zone, page, pfn, order, fpi_flags);
}
- pcp_trylock_finish(UP_flags);
}
void free_frozen_pages(struct page *page, unsigned int order)
@@ -2944,7 +2986,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
*/
void free_unref_folios(struct folio_batch *folios)
{
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct per_cpu_pages *pcp = NULL;
struct zone *locked_zone = NULL;
int i, j;
@@ -2987,8 +3029,7 @@ void free_unref_folios(struct folio_batch *folios)
if (zone != locked_zone ||
is_migrate_isolate(migratetype)) {
if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
locked_zone = NULL;
pcp = NULL;
}
@@ -3007,10 +3048,8 @@ void free_unref_folios(struct folio_batch *folios)
* trylock is necessary as folios may be getting freed
* from IRQ or SoftIRQ context after an IO completion.
*/
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (unlikely(!pcp)) {
- pcp_trylock_finish(UP_flags);
free_one_page(zone, &folio->page, pfn,
order, FPI_NONE);
continue;
@@ -3026,14 +3065,15 @@ void free_unref_folios(struct folio_batch *folios)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(&folio->page);
- free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
- order, FPI_NONE);
+ if (!free_frozen_page_commit(zone, pcp, &folio->page,
+ migratetype, order, FPI_NONE, &UP_flags)) {
+ pcp = NULL;
+ locked_zone = NULL;
+ }
}
- if (pcp) {
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
- }
+ if (pcp)
+ pcp_spin_unlock(pcp, UP_flags);
folio_batch_reinit(folios);
}
@@ -3284,15 +3324,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
- if (!pcp) {
- pcp_trylock_finish(UP_flags);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+ if (!pcp)
return NULL;
- }
/*
* On allocation, reduce the number of pages that are batch freed.
@@ -3302,8 +3339,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp->free_count >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -3936,6 +3972,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
filter &= ~SHOW_MEM_FILTER_NODES;
__show_mem(filter, nodemask, gfp_zone(gfp_mask));
+ mem_cgroup_show_protected_memory(NULL);
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -4643,11 +4680,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (unlikely(nofail)) {
/*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- /*
* Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
* otherwise, we may result in lockup.
*/
@@ -4995,7 +5027,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
struct page **page_array)
{
struct page *page;
- unsigned long __maybe_unused UP_flags;
+ unsigned long UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5089,10 +5121,9 @@ retry_this_zone:
goto failed;
/* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
- pcp_trylock_prepare(UP_flags);
- pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+ pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
if (!pcp)
- goto failed_irq;
+ goto failed;
/* Attempt the batch allocation */
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
@@ -5109,8 +5140,8 @@ retry_this_zone:
if (unlikely(!page)) {
/* Try and allocate at least one page */
if (!nr_account) {
- pcp_spin_unlock(pcp);
- goto failed_irq;
+ pcp_spin_unlock(pcp, UP_flags);
+ goto failed;
}
break;
}
@@ -5121,8 +5152,7 @@ retry_this_zone:
page_array[nr_populated++] = page;
}
- pcp_spin_unlock(pcp);
- pcp_trylock_finish(UP_flags);
+ pcp_spin_unlock(pcp, UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -5130,9 +5160,6 @@ retry_this_zone:
out:
return nr_populated;
-failed_irq:
- pcp_trylock_finish(UP_flags);
-
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
if (page)
@@ -5860,15 +5887,14 @@ static int zone_batchsize(struct zone *zone)
int batch;
/*
- * The number of pages to batch allocate is either ~0.1%
- * of the zone or 1MB, whichever is smaller. The batch
+ * The number of pages to batch allocate is either ~0.025%
+ * of the zone or 256KB, whichever is smaller. The batch
* size is striking a balance between allocation latency
* and zone lock contention.
*/
- batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
+ batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);
+ if (batch <= 1)
+ return 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
@@ -6019,7 +6045,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
{
int new_high_min, new_high_max, new_batch;
- new_batch = max(1, zone_batchsize(zone));
+ new_batch = zone_batchsize(zone);
if (percpu_pagelist_high_fraction) {
new_high_min = zone_highsize(zone, new_batch, cpu_online,
percpu_pagelist_high_fraction);
@@ -6285,10 +6311,21 @@ static void calculate_totalreserve_pages(void)
long max = 0;
unsigned long managed_pages = zone_managed_pages(zone);
- /* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++)
- max = max(max, zone->lowmem_reserve[j]);
+ /*
+ * lowmem_reserve[j] is monotonically non-decreasing
+ * in j for a given zone (see
+ * setup_per_zone_lowmem_reserve()). The maximum
+ * valid reserve lives at the highest index with a
+ * non-zero value, so scan backwards and stop at the
+ * first hit.
+ */
+ for (j = MAX_NR_ZONES - 1; j > i; j--) {
+ if (!zone->lowmem_reserve[j])
+ continue;
+ max = zone->lowmem_reserve[j];
+ break;
+ }
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
@@ -6313,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type i, j;
-
+ /*
+ * For a given zone node_zones[i], lowmem_reserve[j] (j > i)
+ * represents how many pages in zone i must effectively be kept
+ * in reserve when deciding whether an allocation class that is
+ * allowed to allocate from zones up to j may fall back into
+ * zone i.
+ *
+ * As j increases, the allocation class can use a strictly larger
+ * set of fallback zones and therefore must not be allowed to
+ * deplete low zones more aggressively than a less flexible one.
+ * As a result, lowmem_reserve[j] is required to be monotonically
+ * non-decreasing in j for each zone i. Callers such as
+ * calculate_totalreserve_pages() rely on this monotonicity when
+ * selecting the maximum reserve entry.
+ */
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES - 1; i++) {
struct zone *zone = &pgdat->node_zones[i];
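
The backward scan in calculate_totalreserve_pages() leans entirely on the monotonicity spelled out above: because lowmem_reserve[j] is non-decreasing in j for j > i, the first non-zero entry found from the top is the maximum. A sketch of the scan under that invariant:

    #include <stdio.h>

    #define MAX_NR_ZONES 4

    /* Assumes reserve[j] is non-decreasing for j > i; 0 means "unused". */
    static long max_reserve(const long reserve[MAX_NR_ZONES], int i)
    {
        for (int j = MAX_NR_ZONES - 1; j > i; j--)
            if (reserve[j])
                return reserve[j];   /* first hit from the top is the max */
        return 0;
    }

    int main(void)
    {
        long reserve[MAX_NR_ZONES] = { 0, 32, 32, 768 };

        printf("%ld\n", max_reserve(reserve, 0));    /* prints 768 */
        return 0;
    }
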
diff --git a/mm/page_idle.c b/mm/page_idle.c
index a82b340dc204..96bb94c7b6c3 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -71,8 +71,11 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
referenced |= ptep_test_and_clear_young(vma, addr, pvmw.pte);
referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
- referenced = true;
+ pmd_t pmdval = pmdp_get(pvmw.pmd);
+
+ if (likely(pmd_present(pmdval)))
+ referenced |= pmdp_clear_young_notify(vma, addr, pvmw.pmd);
+ referenced |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PMD_SIZE);
} else {
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
@@ -101,19 +104,15 @@ static void page_idle_clear_pte_refs(struct folio *folio)
.rmap_one = page_idle_clear_pte_refs_one,
.anon_lock = folio_lock_anon_vma_read,
};
- bool need_lock;
if (!folio_mapped(folio) || !folio_raw_mapping(folio))
return;
- need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio))
+ if (!folio_trylock(folio))
return;
rmap_walk(folio, &rwc);
-
- if (need_lock)
- folio_unlock(folio);
+ folio_unlock(folio);
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 589ec37c94aa..a70245684206 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -45,6 +45,15 @@ static struct stack failure_stack;
static struct stack *stack_list;
static DEFINE_SPINLOCK(stack_list_lock);
+#define STACK_PRINT_FLAG_STACK 0x1
+#define STACK_PRINT_FLAG_PAGES 0x2
+#define STACK_PRINT_FLAG_HANDLE 0x4
+
+struct stack_print_ctx {
+ struct stack *stack;
+ u8 flags;
+};
+
static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
@@ -760,7 +769,7 @@ static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
-static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
+static void init_pages_in_zone(struct zone *zone)
{
unsigned long pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
@@ -827,31 +836,18 @@ ext_put_continue:
}
pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
- pgdat->node_id, zone->name, count);
-}
-
-static void init_zones_in_node(pg_data_t *pgdat)
-{
- struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
-
- init_pages_in_zone(pgdat, zone);
- }
+ zone->zone_pgdat->node_id, zone->name, count);
}
static void init_early_allocated_pages(void)
{
- pg_data_t *pgdat;
+ struct zone *zone;
- for_each_online_pgdat(pgdat)
- init_zones_in_node(pgdat);
+ for_each_populated_zone(zone)
+ init_pages_in_zone(zone);
}
-static const struct file_operations proc_page_owner_operations = {
+static const struct file_operations page_owner_fops = {
.read = read_page_owner,
.llseek = lseek_page_owner,
};
@@ -859,6 +855,7 @@ static const struct file_operations proc_page_owner_operations = {
static void *stack_start(struct seq_file *m, loff_t *ppos)
{
struct stack *stack;
+ struct stack_print_ctx *ctx = m->private;
if (*ppos == -1UL)
return NULL;
@@ -870,9 +867,9 @@ static void *stack_start(struct seq_file *m, loff_t *ppos)
* value of stack_list.
*/
stack = smp_load_acquire(&stack_list);
- m->private = stack;
+ ctx->stack = stack;
} else {
- stack = m->private;
+ stack = ctx->stack;
}
return stack;
@@ -881,10 +878,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos)
static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
{
struct stack *stack = v;
+ struct stack_print_ctx *ctx = m->private;
stack = stack->next;
*ppos = stack ? *ppos + 1 : -1UL;
- m->private = stack;
+ ctx->stack = stack;
return stack;
}
@@ -898,20 +896,28 @@ static int stack_print(struct seq_file *m, void *v)
unsigned long *entries;
unsigned long nr_entries;
struct stack_record *stack_record = stack->stack_record;
+ struct stack_print_ctx *ctx = m->private;
if (!stack->stack_record)
return 0;
- nr_entries = stack_record->size;
- entries = stack_record->entries;
nr_base_pages = refcount_read(&stack_record->count) - 1;
- if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
+ if (ctx->flags & STACK_PRINT_FLAG_PAGES &&
+ (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold))
return 0;
- for (i = 0; i < nr_entries; i++)
- seq_printf(m, " %pS\n", (void *)entries[i]);
- seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
+ if (ctx->flags & STACK_PRINT_FLAG_STACK) {
+ nr_entries = stack_record->size;
+ entries = stack_record->entries;
+ for (i = 0; i < nr_entries; i++)
+ seq_printf(m, " %pS\n", (void *)entries[i]);
+ }
+ if (ctx->flags & STACK_PRINT_FLAG_HANDLE)
+ seq_printf(m, "handle: %d\n", stack_record->handle.handle);
+ if (ctx->flags & STACK_PRINT_FLAG_PAGES)
+ seq_printf(m, "nr_base_pages: %d\n", nr_base_pages);
+ seq_putc(m, '\n');
return 0;
}
@@ -929,10 +935,20 @@ static const struct seq_operations page_owner_stack_op = {
static int page_owner_stack_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &page_owner_stack_op, 0);
+ int ret = seq_open_private(file, &page_owner_stack_op,
+ sizeof(struct stack_print_ctx));
+
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ struct stack_print_ctx *ctx = m->private;
+
+ ctx->flags = (uintptr_t) inode->i_private;
+ }
+
+ return ret;
}
-static const struct file_operations page_owner_stack_operations = {
+static const struct file_operations page_owner_stack_fops = {
.open = page_owner_stack_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -951,7 +967,7 @@ static int page_owner_threshold_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(proc_page_owner_threshold, &page_owner_threshold_get,
+DEFINE_SIMPLE_ATTRIBUTE(page_owner_threshold_fops, &page_owner_threshold_get,
&page_owner_threshold_set, "%llu");
@@ -964,14 +980,22 @@ static int __init pageowner_init(void)
return 0;
}
- debugfs_create_file("page_owner", 0400, NULL, NULL,
- &proc_page_owner_operations);
+ debugfs_create_file("page_owner", 0400, NULL, NULL, &page_owner_fops);
dir = debugfs_create_dir("page_owner_stacks", NULL);
- debugfs_create_file("show_stacks", 0400, dir, NULL,
- &page_owner_stack_operations);
+ debugfs_create_file("show_stacks", 0400, dir,
+ (void *)(STACK_PRINT_FLAG_STACK |
+ STACK_PRINT_FLAG_PAGES),
+ &page_owner_stack_fops);
+ debugfs_create_file("show_handles", 0400, dir,
+ (void *)(STACK_PRINT_FLAG_HANDLE |
+ STACK_PRINT_FLAG_PAGES),
+ &page_owner_stack_fops);
+ debugfs_create_file("show_stacks_handles", 0400, dir,
+ (void *)(STACK_PRINT_FLAG_STACK |
+ STACK_PRINT_FLAG_HANDLE),
+ &page_owner_stack_fops);
debugfs_create_file("count_threshold", 0600, dir, NULL,
- &proc_page_owner_threshold);
-
+ &page_owner_threshold_fops);
return 0;
}
late_initcall(pageowner_init)
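
All three show_* files above share page_owner_stack_fops and differ only in a small flags value smuggled through the inode's i_private pointer, decoded again in the open handler. A userspace sketch of that pointer-encoding idiom (names illustrative; debugfs itself passes the data pointer through for you):

    #include <stdint.h>
    #include <stdio.h>

    #define FLAG_STACK  0x1
    #define FLAG_PAGES  0x2
    #define FLAG_HANDLE 0x4

    /* Stand-in for an open handler reading inode->i_private. */
    static void show(void *i_private)
    {
        uint8_t flags = (uintptr_t)i_private;

        if (flags & FLAG_STACK)  puts("print stack trace");
        if (flags & FLAG_HANDLE) puts("print handle");
        if (flags & FLAG_PAGES)  puts("print nr_base_pages");
        puts("--");
    }

    int main(void)
    {
        /* Encode: cast the flag mask to a pointer at create time. */
        show((void *)(uintptr_t)(FLAG_STACK | FLAG_PAGES));
        show((void *)(uintptr_t)(FLAG_HANDLE | FLAG_PAGES));
        show((void *)(uintptr_t)(FLAG_STACK | FLAG_HANDLE));
        return 0;
    }
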
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 4eeca782b888..741884645ab0 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -8,7 +8,7 @@
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#undef pr_fmt
#define pr_fmt(fmt) "page_table_check: " fmt
@@ -179,18 +179,21 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
EXPORT_SYMBOL(__page_table_check_pud_clear);
/* Whether the swap entry cached writable information */
-static inline bool swap_cached_writable(swp_entry_t entry)
+static inline bool softleaf_cached_writable(softleaf_t entry)
{
- return is_writable_device_private_entry(entry) ||
- is_writable_migration_entry(entry);
+ return softleaf_is_device_private_write(entry) ||
+ softleaf_is_migration_write(entry);
}
-static inline void page_table_check_pte_flags(pte_t pte)
+static void page_table_check_pte_flags(pte_t pte)
{
- if (pte_present(pte) && pte_uffd_wp(pte))
- WARN_ON_ONCE(pte_write(pte));
- else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
- WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
+ if (pte_present(pte)) {
+ WARN_ON_ONCE(pte_uffd_wp(pte) && pte_write(pte));
+ } else if (pte_swp_uffd_wp(pte)) {
+ const softleaf_t entry = softleaf_from_pte(pte);
+
+ WARN_ON_ONCE(softleaf_cached_writable(entry));
+ }
}
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
@@ -212,10 +215,14 @@ EXPORT_SYMBOL(__page_table_check_ptes_set);
static inline void page_table_check_pmd_flags(pmd_t pmd)
{
- if (pmd_present(pmd) && pmd_uffd_wp(pmd))
- WARN_ON_ONCE(pmd_write(pmd));
- else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
- WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
+ if (pmd_present(pmd)) {
+ if (pmd_uffd_wp(pmd))
+ WARN_ON_ONCE(pmd_write(pmd));
+ } else if (pmd_swp_uffd_wp(pmd)) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
+
+ WARN_ON_ONCE(softleaf_cached_writable(entry));
+ }
}
void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c498a91b6706..b38a1d00c971 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -3,7 +3,7 @@
#include <linux/rmap.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include "internal.h"
@@ -16,6 +16,7 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw)
static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
spinlock_t **ptlp)
{
+ bool is_migration;
pte_t ptent;
if (pvmw->flags & PVMW_SYNC) {
@@ -26,6 +27,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp,
return !!pvmw->pte;
}
+ is_migration = pvmw->flags & PVMW_MIGRATION;
again:
/*
* It is important to return the ptl corresponding to pte,
@@ -41,11 +43,14 @@ again:
ptent = ptep_get(pvmw->pte);
- if (pvmw->flags & PVMW_MIGRATION) {
- if (!is_swap_pte(ptent))
+ if (pte_none(ptent)) {
+ return false;
+ } else if (pte_present(ptent)) {
+ if (is_migration)
return false;
- } else if (is_swap_pte(ptent)) {
- swp_entry_t entry;
+ } else if (!is_migration) {
+ softleaf_t entry;
+
/*
* Handle un-addressable ZONE_DEVICE memory.
*
@@ -62,12 +67,10 @@ again:
* For more details on device private memory see HMM
* (include/linux/hmm.h or mm/hmm.c).
*/
- entry = pte_to_swp_entry(ptent);
- if (!is_device_private_entry(entry) &&
- !is_device_exclusive_entry(entry))
+ entry = softleaf_from_pte(ptent);
+ if (!softleaf_is_device_private(entry) &&
+ !softleaf_is_device_exclusive(entry))
return false;
- } else if (!pte_present(ptent)) {
- return false;
}
spin_lock(*ptlp);
if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) {
@@ -107,30 +110,23 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw, unsigned long pte_nr)
pte_t ptent = ptep_get(pvmw->pte);
if (pvmw->flags & PVMW_MIGRATION) {
- swp_entry_t entry;
- if (!is_swap_pte(ptent))
- return false;
- entry = pte_to_swp_entry(ptent);
+ const softleaf_t entry = softleaf_from_pte(ptent);
- if (!is_migration_entry(entry))
+ if (!softleaf_is_migration(entry))
return false;
- pfn = swp_offset_pfn(entry);
- } else if (is_swap_pte(ptent)) {
- swp_entry_t entry;
+ pfn = softleaf_to_pfn(entry);
+ } else if (pte_present(ptent)) {
+ pfn = pte_pfn(ptent);
+ } else {
+ const softleaf_t entry = softleaf_from_pte(ptent);
/* Handle un-addressable ZONE_DEVICE memory */
- entry = pte_to_swp_entry(ptent);
- if (!is_device_private_entry(entry) &&
- !is_device_exclusive_entry(entry))
- return false;
-
- pfn = swp_offset_pfn(entry);
- } else {
- if (!pte_present(ptent))
+ if (!softleaf_is_device_private(entry) &&
+ !softleaf_is_device_exclusive(entry))
return false;
- pfn = pte_pfn(ptent);
+ pfn = softleaf_to_pfn(entry);
}
if ((pfn + pte_nr - 1) < pvmw->pfn)
@@ -246,18 +242,19 @@ restart:
*/
pmde = pmdp_get_lockless(pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ if (pmd_trans_huge(pmde) || pmd_is_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {
- swp_entry_t entry;
+ softleaf_t entry;
if (!thp_migration_supported() ||
!(pvmw->flags & PVMW_MIGRATION))
return not_found(pvmw);
- entry = pmd_to_swp_entry(pmde);
- if (!is_migration_entry(entry) ||
- !check_pmd(swp_offset_pfn(entry), pvmw))
+ entry = softleaf_from_pmd(pmde);
+
+ if (!softleaf_is_migration(entry) ||
+ !check_pmd(softleaf_to_pfn(entry), pvmw))
return not_found(pvmw);
return true;
}
@@ -277,6 +274,13 @@ restart:
* cannot return prematurely, while zap_huge_pmd() has
* cleared *pmd but not decremented compound_mapcount().
*/
+ const softleaf_t entry = softleaf_from_pmd(pmde);
+
+ if (softleaf_is_device_private(entry)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ return true;
+ }
+
if ((pvmw->flags & PVMW_SYNC) &&
thp_vma_suitable_order(vma, pvmw->address,
PMD_ORDER) &&
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 9f91cf85a5be..90cc346a6ecf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -5,7 +5,7 @@
#include <linux/hugetlb.h>
#include <linux/mmu_context.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <asm/tlbflush.h>
@@ -452,7 +452,7 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
* We usually restrict the ability to install PTEs, but this functionality is
* available to internal memory management code and provided in mm/internal.h.
*/
-int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
{
@@ -518,10 +518,10 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
* This check is performed on all functions which are parameterised by walk
* operations and exposed in include/linux/pagewalk.h.
*
- * Internal memory management code can use the walk_page_range_mm() function to
- * be able to use all page walking operations.
+ * Internal memory management code can use *_unsafe() functions to be able to
+ * use all page walking operations.
*/
-static bool check_ops_valid(const struct mm_walk_ops *ops)
+static bool check_ops_safe(const struct mm_walk_ops *ops)
{
/*
* The installation of PTEs is solely under the control of memory
@@ -579,10 +579,10 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
{
- if (!check_ops_valid(ops))
+ if (!check_ops_safe(ops))
return -EINVAL;
- return walk_page_range_mm(mm, start, end, ops, private);
+ return walk_page_range_mm_unsafe(mm, start, end, ops, private);
}
/**
@@ -639,7 +639,7 @@ int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end
if (start >= end)
return -EINVAL;
- if (!check_ops_valid(ops))
+ if (!check_ops_safe(ops))
return -EINVAL;
return walk_pgd_range(start, end, &walk);
@@ -678,7 +678,7 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
pgd, private);
if (start >= end || !walk.mm)
return -EINVAL;
- if (!check_ops_valid(ops))
+ if (!check_ops_safe(ops))
return -EINVAL;
/*
@@ -694,9 +694,8 @@ int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
return walk_pgd_range(start, end, &walk);
}
-int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, const struct mm_walk_ops *ops,
- void *private)
+int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops, void *private)
{
struct mm_walk walk = {
.ops = ops,
@@ -709,14 +708,22 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
return -EINVAL;
if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- if (!check_ops_valid(ops))
- return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
process_vma_walk_lock(vma, ops->walk_lock);
return __walk_page_range(start, end, &walk);
}
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
+{
+ if (!check_ops_safe(ops))
+ return -EINVAL;
+
+ return walk_page_range_vma_unsafe(vma, start, end, ops, private);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
@@ -729,7 +736,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
- if (!check_ops_valid(ops))
+ if (!check_ops_safe(ops))
return -EINVAL;
process_mm_walk_lock(walk.mm, ops->walk_lock);
@@ -780,7 +787,7 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
unsigned long start_addr, end_addr;
int err = 0;
- if (!check_ops_valid(ops))
+ if (!check_ops_safe(ops))
return -EINVAL;
lockdep_assert_held(&mapping->i_mmap_rwsem);
@@ -966,10 +973,10 @@ pmd_table:
goto found;
}
} else if ((flags & FW_MIGRATION) &&
- is_pmd_migration_entry(pmd)) {
- swp_entry_t entry = pmd_to_swp_entry(pmd);
+ pmd_is_migration_entry(pmd)) {
+ const softleaf_t entry = softleaf_from_pmd(pmd);
- page = pfn_swap_entry_to_page(entry);
+ page = softleaf_to_page(entry);
expose_page = false;
goto found;
}
@@ -1000,11 +1007,10 @@ pte_table:
goto found;
}
} else if (!pte_none(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
+ const softleaf_t entry = softleaf_from_pte(pte);
- if ((flags & FW_MIGRATION) &&
- is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
+ if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) {
+ page = softleaf_to_page(entry);
expose_page = false;
goto found;
}
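
The *_unsafe renames make the layering explicit: the exported walkers validate the ops table and delegate to an unchecked core that internal mm code may call directly. A sketch of that wrapper shape (the predicate below is a stand-in for check_ops_safe()):

    #include <stdbool.h>
    #include <stdio.h>

    struct walk_ops { bool installs_ptes; };

    /* Only internal mm code is allowed to install PTEs. */
    static bool check_ops_safe(const struct walk_ops *ops)
    {
        return !ops->installs_ptes;
    }

    static int walk_range_unsafe(const struct walk_ops *ops)
    {
        (void)ops;
        puts("walking page tables");
        return 0;
    }

    /* Exported entry point: validate, then defer to the unchecked core. */
    static int walk_range(const struct walk_ops *ops)
    {
        if (!check_ops_safe(ops))
            return -1;    /* -EINVAL in the kernel */
        return walk_range_unsafe(ops);
    }

    int main(void)
    {
        struct walk_ops normal   = { .installs_ptes = false };
        struct walk_ops internal = { .installs_ptes = true };

        walk_range(&normal);              /* allowed */
        if (walk_range(&internal) < 0)    /* rejected at the public API */
            walk_range_unsafe(&internal); /* internal callers skip the check */
        return 0;
    }
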
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index cd69caf6aa8d..4f5937090590 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -194,7 +194,7 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
int nr_pages)
{
return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
- PAGE_KERNEL, pages, PAGE_SHIFT);
+ PAGE_KERNEL, pages, PAGE_SHIFT, GFP_KERNEL);
}
/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..d3aec7a9926a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,7 +13,9 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
-#include <asm/pgalloc.h>
+#include <linux/iommu.h>
+#include <linux/pgalloc.h>
+
#include <asm/tlb.h>
/*
@@ -290,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
if (pmdvalp)
*pmdvalp = pmdval;
- if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
goto nomap;
if (unlikely(pmd_trans_huge(pmdval)))
goto nomap;
@@ -406,3 +408,41 @@ again:
pte_unmap_unlock(pte, ptl);
goto again;
}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+ struct list_head list;
+ /* protect above ptdesc lists */
+ spinlock_t lock;
+ struct work_struct work;
+} kernel_pgtable_work = {
+ .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+ .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+ .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+ struct ptdesc *pt, *next;
+ LIST_HEAD(page_list);
+
+ spin_lock(&kernel_pgtable_work.lock);
+ list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+ list_for_each_entry_safe(pt, next, &page_list, pt_list)
+ __pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+ spin_lock(&kernel_pgtable_work.lock);
+ list_add(&pt->pt_list, &kernel_pgtable_work.list);
+ spin_unlock(&kernel_pgtable_work.lock);
+
+ schedule_work(&kernel_pgtable_work.work);
+}
+#endif
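
The async free path above is a classic splice-then-drain shape: producers push onto a locked list and kick a work item; the worker splices the whole list out under the lock, performs one global invalidation, then frees everything off-list. A single-threaded sketch of that shape (stubs replace the spinlock, the work item, and the IOMMU flush):

    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    static struct node *pending;   /* protected by a spinlock in the kernel */

    static void defer_free(struct node *n)
    {
        if (!n)
            return;
        n->next = pending;         /* list_add() under the lock */
        pending = n;
        /* schedule_work() would be kicked here */
    }

    static void work_func(void)
    {
        struct node *list = pending;   /* splice out under the lock */
        pending = NULL;

        puts("one global invalidation for the whole batch");
        while (list) {
            struct node *next = list->next;
            free(list);                /* __pagetable_free() stand-in */
            list = next;
        }
    }

    int main(void)
    {
        defer_free(calloc(1, sizeof(struct node)));
        defer_free(calloc(1, sizeof(struct node)));
        work_func();
        return 0;
    }
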
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
index 7e9455a18aae..0d9cfbf4fe5d 100644
--- a/mm/pt_reclaim.c
+++ b/mm/pt_reclaim.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/hugetlb.h>
+#include <linux/pgalloc.h>
+
#include <asm-generic/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
diff --git a/mm/ptdump.c b/mm/ptdump.c
index b600c7f864b8..973020000096 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -31,7 +31,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pgd_t val = READ_ONCE(*pgd);
+ pgd_t val = pgdp_get(pgd);
#if CONFIG_PGTABLE_LEVELS > 4 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
@@ -54,7 +54,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- p4d_t val = READ_ONCE(*p4d);
+ p4d_t val = p4dp_get(p4d);
#if CONFIG_PGTABLE_LEVELS > 3 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
@@ -77,7 +77,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pud_t val = READ_ONCE(*pud);
+ pud_t val = pudp_get(pud);
#if CONFIG_PGTABLE_LEVELS > 2 && \
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
@@ -100,7 +100,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pmd_t val = READ_ONCE(*pmd);
+ pmd_t val = pmdp_get(pmd);
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
@@ -121,7 +121,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pte_t val = ptep_get_lockless(pte);
+ pte_t val = ptep_get(pte);
if (st->effective_prot_pte)
st->effective_prot_pte(st, val);
diff --git a/mm/rmap.c b/mm/rmap.c
index ac4f783d6ec2..f955f02d570e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -57,7 +57,7 @@
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
@@ -489,17 +489,15 @@ void __init anon_vma_init(void)
* if there is a mapcount, we can dereference the anon_vma after observing
* those.
*
- * NOTE: the caller should normally hold folio lock when calling this. If
- * not, the caller needs to double check the anon_vma didn't change after
- * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
- * concurrently without folio lock protection). See folio_lock_anon_vma_read()
- * which has already covered that, and comment above remap_pages().
+ * NOTE: the caller must hold the folio lock when calling this.
*/
struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
@@ -546,7 +544,8 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct anon_vma *root_anon_vma;
unsigned long anon_mapping;
-retry:
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
rcu_read_lock();
anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
@@ -558,17 +557,6 @@ retry:
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
- * folio_move_anon_rmap() might have changed the anon_vma as we
- * might not hold the folio lock here.
- */
- if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
- anon_mapping)) {
- up_read(&root_anon_vma->rwsem);
- rcu_read_unlock();
- goto retry;
- }
-
- /*
* If the folio is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
@@ -602,18 +590,6 @@ retry:
rcu_read_unlock();
anon_vma_lock_read(anon_vma);
- /*
- * folio_move_anon_rmap() might have changed the anon_vma as we might
- * not hold the folio lock here.
- */
- if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
- anon_mapping)) {
- anon_vma_unlock_read(anon_vma);
- put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto retry;
- }
-
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
@@ -988,7 +964,7 @@ int folio_referenced(struct folio *folio, int is_locked,
if (!folio_raw_mapping(folio))
return 0;
- if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
+ if (!is_locked) {
we_locked = folio_trylock(folio);
if (!we_locked)
return 1;
@@ -1046,9 +1022,16 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw->pmd;
- pmd_t entry;
+ pmd_t entry = pmdp_get(pmd);
- if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
+ /*
+ * Please see the comment above (!pte_present).
+			 * A non-present PMD is not writable from a CPU
+ * perspective.
+ */
+ if (!pmd_present(entry))
+ continue;
+ if (!pmd_dirty(entry) && !pmd_write(entry))
continue;
flush_cache_range(vma, address,
@@ -1229,12 +1212,12 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
if (nr) {
idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, nr);
+ lruvec_stat_mod_folio(folio, idx, nr);
}
if (nr_pmdmapped) {
if (folio_test_anon(folio)) {
idx = NR_ANON_THPS;
- __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
} else {
/* NR_*_PMDMAPPED are not maintained per-memcg */
idx = folio_test_swapbacked(folio) ?
@@ -1757,9 +1740,13 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
* the folio is unmapped and at least one page is still mapped.
*
* Check partially_mapped first to ensure it is a large folio.
+ *
+	 * Device-private folios support neither deferred splitting nor
+	 * shrinker-based scanning of folios to free.
*/
if (partially_mapped && folio_test_anon(folio) &&
- !folio_test_partially_mapped(folio))
+ !folio_test_partially_mapped(folio) &&
+ !folio_is_device_private(folio))
deferred_split_folio(folio, true);
__folio_mod_stat(folio, -nr, -nr_pmdmapped);
@@ -1982,7 +1969,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (likely(pte_present(pteval))) {
pfn = pte_pfn(pteval);
} else {
- pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ const softleaf_t entry = softleaf_from_pte(pteval);
+
+ pfn = softleaf_to_pfn(entry);
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
@@ -2339,6 +2328,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
while (page_vma_mapped_walk(&pvmw)) {
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ __maybe_unused unsigned long pfn;
+ __maybe_unused pmd_t pmdval;
+
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_locked(vma, pvmw.address,
pvmw.pmd, true);
@@ -2347,8 +2339,14 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- subpage = folio_page(folio,
- pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+ pmdval = pmdp_get(pvmw.pmd);
+ if (likely(pmd_present(pmdval)))
+ pfn = pmd_pfn(pmdval);
+ else
+ pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval));
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
+
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
@@ -2372,7 +2370,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (likely(pte_present(pteval))) {
pfn = pte_pfn(pteval);
} else {
- pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ const softleaf_t entry = softleaf_from_pte(pteval);
+
+ pfn = softleaf_to_pfn(entry);
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
@@ -2457,8 +2457,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
folio_mark_dirty(folio);
writable = pte_write(pteval);
} else {
+ const softleaf_t entry = softleaf_from_pte(pteval);
+
pte_clear(mm, address, pvmw.pte);
- writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
+
+ writable = softleaf_is_device_private_write(entry);
}
VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
@@ -2828,6 +2831,12 @@ static void rmap_walk_anon(struct folio *folio,
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
+ /*
+ * The folio lock ensures that folio->mapping can't be changed under us
+	 * to an anon_vma with a different root.
+ */
+ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+
if (locked) {
anon_vma = folio_anon_vma(folio);
/* anon_vma disappear under us? */
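
With the folio lock promoted to a hard precondition, the anon_vma lookups can assert it and delete the re-validation/retry loops that used to guard against folio->mapping changing underneath them. A sketch of trading optimistic retry for an asserted invariant:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct folio { bool locked; void *mapping; };

    static void *get_anon_vma(const struct folio *folio)
    {
        /*
         * Analogue of VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio):
         * the lock pins folio->mapping, so no retry loop is needed.
         */
        assert(folio->locked);
        return folio->mapping;
    }

    int main(void)
    {
        struct folio f = { .locked = true, .mapping = &f };

        printf("anon_vma at %p\n", get_anon_vma(&f));
        return 0;
    }
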
diff --git a/mm/secretmem.c b/mm/secretmem.c
index f0ef4e198884..edf111e0a1bb 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -120,7 +120,7 @@ static int secretmem_release(struct inode *inode, struct file *file)
static int secretmem_mmap_prepare(struct vm_area_desc *desc)
{
- const unsigned long len = desc->end - desc->start;
+ const unsigned long len = vma_desc_size(desc);
if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
return -EINVAL;
@@ -224,7 +224,7 @@ err_free_inode:
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
- /* make sure local flags do not confict with global fcntl.h */
+ /* make sure local flags do not conflict with global fcntl.h */
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
if (!secretmem_enable || !can_set_direct_map())
diff --git a/mm/shmem.c b/mm/shmem.c
index 899303d8c9aa..d578d8e765d7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
@@ -569,8 +569,37 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
-static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE
+#else
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#endif
+
+static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT;
+
+#undef SHMEM_HUGE_DEFAULT
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE
+#else
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#endif
+
+static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT;
+
+#undef TMPFS_HUGE_DEFAULT
static unsigned int shmem_get_orders_within_size(struct inode *inode,
unsigned long within_size_orders, pgoff_t index,
@@ -615,34 +644,23 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
* the mTHP interface, so we still use PMD-sized huge order to
* check whether global control is enabled.
*
- * For tmpfs mmap()'s huge order, we still use PMD-sized order to
- * allocate huge pages due to lack of a write size hint.
- *
* For tmpfs with 'huge=always' or 'huge=within_size' mount option,
* we will always try PMD-sized order first. If that failed, it will
* fall back to small large folios.
*/
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
- if (vma)
- return maybe_pmd_order;
-
return THP_ORDERS_ALL_FILE_DEFAULT;
case SHMEM_HUGE_WITHIN_SIZE:
- if (vma)
- within_size_orders = maybe_pmd_order;
- else
- within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT;
-
- within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
- index, write_end);
+ within_size_orders = shmem_get_orders_within_size(inode,
+ THP_ORDERS_ALL_FILE_DEFAULT, index, write_end);
if (within_size_orders > 0)
return within_size_orders;
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
- return maybe_pmd_order;
+ return THP_ORDERS_ALL_FILE_DEFAULT;
fallthrough;
default:
return 0;
@@ -852,9 +870,9 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index
static void shmem_update_stats(struct folio *folio, int nr_pages)
{
if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
}
/*
@@ -1616,7 +1634,7 @@ try_split:
folio_mark_uptodate(folio);
}
- if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
+ if (!folio_alloc_swap(folio)) {
bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
int error;
@@ -2256,7 +2274,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
- swp_entry_t swap, index_entry;
+ swp_entry_t swap;
+ softleaf_t index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
bool skip_swapcache = false;
@@ -2268,7 +2287,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(index_entry))
+ if (softleaf_is_poison_marker(index_entry))
return -EIO;
si = get_swap_device(index_entry);
@@ -2758,8 +2777,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (len > TASK_SIZE)
return -ENOMEM;
- addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
- flags);
+ addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
@@ -2837,8 +2855,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (inflated_len < len)
return addr;
- inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
- inflated_len, 0, flags);
+ inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
@@ -2926,16 +2943,17 @@ out_nomem:
return retval;
}
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int shmem_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
if (inode->i_nlink)
- vma->vm_ops = &shmem_vm_ops;
+ desc->vm_ops = &shmem_vm_ops;
else
- vma->vm_ops = &shmem_anon_vm_ops;
+ desc->vm_ops = &shmem_anon_vm_ops;
return 0;
}
@@ -3860,12 +3878,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
-
- dget(dentry); /* Extra count - pin the dentry in core */
+ d_make_persistent(dentry, inode);
return error;
out_iput:
@@ -3926,7 +3939,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret = 0;
+ int ret;
/*
* No ordinary (disk based) filesystem counts links as inodes;
@@ -3938,29 +3951,19 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink) {
ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
- goto out;
+ return ret;
}
ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
if (ret) {
if (inode->i_nlink)
shmem_free_inode(inode->i_sb, 0);
- goto out;
+ return ret;
}
dir->i_size += BOGO_DIRENT_SIZE;
- inode_set_mtime_to_ts(dir,
- inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
- inc_nlink(inode);
- ihold(inode); /* New dentry reference */
- dget(dentry); /* Extra pinning count for the created dentry */
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
-out:
- return ret;
+ return simple_link(old_dentry, dir, dentry);
}
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
@@ -3973,11 +3976,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode_set_mtime_to_ts(dir,
- inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
- drop_nlink(inode);
- dput(dentry); /* Undo the count from "create" - does all the work */
+ simple_unlink(dir, dentry);
/*
* For now, VFS can't deal with case-insensitive negative dentries, so
@@ -4132,11 +4132,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
return 0;
out_remove_offset:
@@ -5205,7 +5201,7 @@ static const struct address_space_operations shmem_aops = {
};
static const struct file_operations shmem_file_operations = {
- .mmap = shmem_mmap,
+ .mmap_prepare = shmem_mmap_prepare,
.open = shmem_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
@@ -5336,7 +5332,7 @@ static struct file_system_type shmem_fs_type = {
#ifdef CONFIG_TMPFS
.parameters = shmem_fs_parameters,
#endif
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};
@@ -5774,7 +5770,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif
@@ -5880,14 +5876,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
-/**
- * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
+static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
{
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start;
+ loff_t size = end - start;
/*
* Cloning a new file under mmap_lock leads to a lock ordering conflict
@@ -5895,7 +5886,18 @@ int shmem_zero_setup(struct vm_area_struct *vma)
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
- file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
+ return shmem_kernel_file_setup("dev/zero", size, vm_flags);
+}
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+ struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
+
if (IS_ERR(file))
return PTR_ERR(file);
@@ -5908,6 +5910,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
}
/**
+ * shmem_zero_setup_desc - same as shmem_zero_setup(), but takes a VMA
+ * descriptor for convenience.
+ * @desc: Describes VMA
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup_desc(struct vm_area_desc *desc)
+{
+ struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ desc->vm_file = file;
+ desc->vm_ops = &shmem_anon_vm_ops;
+
+ return 0;
+}
+
+/**
* shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
* @mapping: the folio's address_space
* @index: the folio index
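For illustration only, a sketch of how a character-device .mmap_prepare() hook might consume the new shmem_zero_setup_desc() helper; the hook name and the VM_SHARED check are assumptions for the example, not part of this patch:

	/* Hypothetical .mmap_prepare() hook for a /dev/zero-style device. */
	static int zero_mmap_prepare(struct vm_area_desc *desc)
	{
		/*
		 * Shared anonymous mappings get shmem backing via the helper,
		 * which fills in desc->vm_file and desc->vm_ops on success.
		 */
		if (desc->vm_flags & VM_SHARED)
			return shmem_zero_setup_desc(desc);

		/* Private mappings need no vm_ops in this sketch. */
		return 0;
	}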
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 84dfff4f7b1f..b613533b29e7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -259,7 +259,7 @@ out:
* @object_size: The size of objects to be created in this cache.
* @args: Additional arguments for the cache creation (see
* &struct kmem_cache_args).
- * @flags: See the desriptions of individual flags. The common ones are listed
+ * @flags: See the descriptions of individual flags. The common ones are listed
* in the description below.
*
* Not to be called directly, use the kmem_cache_create() wrapper with the same
diff --git a/mm/slub.c b/mm/slub.c
index 2acce22590f8..e6a330e24145 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,7 +2530,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
memset((char *)kasan_reset_tag(x) + inuse, 0,
s->size - inuse - rsize);
/*
- * Restore orig_size, otherwize kmalloc redzone overwritten
+ * Restore orig_size, otherwise kmalloc redzone overwritten
* would be reported
*/
set_orig_size(s, x, orig_size);
@@ -7110,7 +7110,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
+ * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
@@ -7119,6 +7119,7 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
gfp_t flags, int node)
{
+ bool allow_block;
void *ret;
/*
@@ -7131,10 +7132,6 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
if (ret || size <= PAGE_SIZE)
return ret;
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
@@ -7142,13 +7139,23 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
}
/*
+ * For non-blocking allocations, VM_ALLOW_HUGE_VMAP is not used
+ * because the huge-mapping path in vmalloc contains at
+ * least one might_sleep() call.
+ *
+ * TODO: Revise huge-mapping path to support non-blocking
+ * flags.
+ */
+ allow_block = gfpflags_allow_blocking(flags);
+
+ /*
* kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
* since the callers already cannot assume anything
* about the resulting pointer, and cannot play
* protection games.
*/
return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP : 0,
node, __builtin_return_address(0));
}
EXPORT_SYMBOL(__kvmalloc_node_noprof);
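With the hunk above, a non-blocking kvmalloc() may now reach the vmalloc fallback (without VM_ALLOW_HUGE_VMAP) rather than failing outright once kmalloc cannot satisfy the request. A minimal caller sketch; the size is illustrative and the surrounding function is assumed:

	void *buf;

	buf = kvmalloc(64 * PAGE_SIZE, GFP_NOWAIT);
	if (!buf)
		return -ENOMEM;	/* can still fail under memory pressure */
	/* ... use buf ... */
	kvfree(buf);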
@@ -7899,11 +7906,11 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
- * This is the case if we do RCU, have a constructor or
- * destructor, are poisoning the objects, or are
- * redzoning an object smaller than sizeof(void *) or are
- * redzoning an object with slub_debug_orig_size() enabled,
- * in which case the right redzone may be extended.
+ * This is the case if we do RCU, have a constructor, are
+ * poisoning the objects, or are redzoning an object smaller
+ * than sizeof(void *) or are redzoning an object with
+ * slub_debug_orig_size() enabled, in which case the right
+ * redzone may be extended.
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index dbd8daccade2..37522d6cb398 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -439,7 +439,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
return -ENOMEM;
pmd = pmd_offset(pud, addr);
- if (pmd_none(READ_ONCE(*pmd))) {
+ if (pmd_none(pmdp_get(pmd))) {
void *p;
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
diff --git a/mm/swap.h b/mm/swap.h
index 8d8efdf1297a..d034c13d8dd2 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -445,25 +445,4 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
return 0;
}
#endif /* CONFIG_SWAP */
-
-/**
- * folio_index - File index of a folio.
- * @folio: The folio.
- *
- * For a folio which is either in the page cache or the swap cache,
- * return its index within the address_space it belongs to. If you know
- * the folio is definitely in the page cache, you can look at the folio's
- * index directly.
- *
- * Return: The index (offset in units of pages) of a folio in its file.
- */
-static inline pgoff_t folio_index(struct folio *folio)
-{
-#ifdef CONFIG_SWAP
- if (unlikely(folio_test_swapcache(folio)))
- return swp_offset(folio->swap);
-#endif
- return folio->index;
-}
-
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f4980dde5394..5f97c6ae70a2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -12,7 +12,7 @@
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
@@ -509,10 +509,6 @@ put_and_return:
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
- *
- * get/put_swap_device() aren't needed to call this function, because
- * __read_swap_cache_async() call them and swap_read_folio() holds the
- * swap cache folio lock.
*/
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
@@ -736,7 +732,6 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
pte_t *pte = NULL, pentry;
int win;
unsigned long start, end, addr;
- swp_entry_t entry;
pgoff_t ilx;
bool page_allocated;
@@ -749,6 +744,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
struct swap_info_struct *si = NULL;
+ softleaf_t entry;
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
@@ -756,10 +752,9 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
break;
}
pentry = ptep_get_lockless(pte);
- if (!is_swap_pte(pentry))
- continue;
- entry = pte_to_swp_entry(pentry);
- if (unlikely(non_swap_entry(entry)))
+ entry = softleaf_from_pte(pentry);
+
+ if (!softleaf_is_swap(entry))
continue;
pte_unmap(pte);
pte = NULL;
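The conversion pattern used throughout this series, in miniature. This sketch assumes local variables mm, pmd, addr and ptep, plus a hypothetical handle_swap() helper; softleaf_from_pte() folds the old is_swap_pte()/non_swap_entry() checks into a single representation:

	pte_t pte = ptep_get_lockless(ptep);
	softleaf_t entry = softleaf_from_pte(pte);

	if (softleaf_is_swap(entry)) {
		/* A genuine swap entry: swp_type()/swp_offset() apply. */
		handle_swap(entry);
	} else if (softleaf_is_migration(entry)) {
		/* Wait for the migration to finish, then retry the fault. */
		migration_entry_wait(mm, pmd, addr);
	}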
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1b4b9d80e3b..46d2008e4b99 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -44,7 +44,7 @@
#include <linux/plist.h>
#include <asm/tlbflush.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
@@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority = -1;
+#define DEF_SWAP_PRIO -1
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
@@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -236,11 +236,10 @@ again:
ret = -nr_pages;
/*
- * When this function is called from scan_swap_map_slots() and it's
- * called by vmscan.c at reclaiming folios. So we hold a folio lock
- * here. We have to use trylock for avoiding deadlock. This is a special
- * case and you should use folio_free_swap() with explicit folio_lock()
- * in usual operations.
+ * We hold a folio lock here. We have to use trylock to
+ * avoid deadlock. This is a special case and you should
+ * use folio_free_swap() with explicit folio_lock() in usual
+ * operations.
*/
if (!folio_trylock(folio))
goto out;
@@ -594,7 +593,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
* this returns NULL for a non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list, int order)
+ struct swap_info_struct *si, struct list_head *list)
{
struct swap_cluster_info *ci, *found = NULL;
@@ -751,14 +750,14 @@ static void relocate_cluster(struct swap_info_struct *si,
}
/*
- * The cluster corresponding to page_nr will be used. The cluster will not be
- * added to free cluster list and its usage counter will be increased by 1.
- * Only used for initialization.
+ * The cluster corresponding to @offset will be accounted as having one bad
+ * slot. The cluster will not be added to the free cluster list, and its
+ * usage counter will be increased by 1. Only used for initialization.
*/
-static int inc_cluster_info_page(struct swap_info_struct *si,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
+ unsigned long offset)
{
- unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ unsigned long idx = offset / SWAPFILE_CLUSTER;
struct swap_table *table;
struct swap_cluster_info *ci;
@@ -772,8 +771,8 @@ static int inc_cluster_info_page(struct swap_info_struct *si,
ci->count++;
- VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
- VM_BUG_ON(ci->flags);
+ WARN_ON(ci->count > SWAPFILE_CLUSTER);
+ WARN_ON(ci->flags);
return 0;
}
@@ -957,7 +956,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
unsigned int found = SWAP_ENTRY_INVALID;
do {
- struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
unsigned long offset;
if (!ci)
@@ -982,7 +981,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -1101,13 +1100,6 @@ new_cluster:
goto done;
}
- /*
- * We don't have free cluster but have some clusters in discarding,
- * do discard now and reclaim them.
- */
- if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
- goto new_cluster;
-
if (order)
goto done;
@@ -1137,7 +1129,6 @@ done:
/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
- int nid;
unsigned long pages;
spin_lock(&swap_avail_lock);
@@ -1166,8 +1157,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
goto skip;
}
- for_each_node(nid)
- plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_del(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
@@ -1176,7 +1166,6 @@ skip:
/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
- int nid;
long val;
unsigned long pages;
@@ -1209,8 +1198,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
goto skip;
}
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_add(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
@@ -1350,54 +1338,79 @@ static bool swap_alloc_fast(swp_entry_t *entry,
}
/* Rotate the device and switch to a new cluster */
-static bool swap_alloc_slow(swp_entry_t *entry,
+static void swap_alloc_slow(swp_entry_t *entry,
int order)
{
- int node;
unsigned long offset;
struct swap_info_struct *si, *next;
- node = numa_node_id();
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
/* Rotate the device and switch to a new cluster */
- plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+ plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si);
if (offset) {
*entry = swp_entry(si->type, offset);
- return true;
+ return;
}
if (order)
- return false;
+ return;
}
spin_lock(&swap_avail_lock);
/*
* if we got here, it's likely that si was almost full before,
- * and since scan_swap_map_slots() can drop the si->lock,
* multiple callers probably all tried to get a page from the
* same si and it filled up before we could get one; or, the si
- * filled up between us dropping swap_avail_lock and taking
- * si->lock. Since we dropped the swap_avail_lock, the
- * swap_avail_head list may have been modified; so if next is
- * still in the swap_avail_head list then try it, otherwise
- * start over if we have not gotten any slots.
+ * filled up while we had dropped swap_avail_lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over if we
+ * have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_lists[node]))
+ if (plist_node_empty(&next->avail_list))
goto start_over;
}
spin_unlock(&swap_avail_lock);
+}
+
+/*
+ * Synchronously discard pending clusters when under high swap pressure.
+ * Return: true if any cluster is discarded.
+ */
+static bool swap_sync_discard(void)
+{
+ bool ret = false;
+ struct swap_info_struct *si, *next;
+
+ spin_lock(&swap_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_active_head, list) {
+ spin_unlock(&swap_lock);
+ if (get_swap_device_info(si)) {
+ if (si->flags & SWP_PAGE_DISCARD)
+ ret = swap_do_scheduled_discard(si);
+ put_swap_device(si);
+ }
+ if (ret)
+ return true;
+
+ spin_lock(&swap_lock);
+ if (plist_node_empty(&next->list))
+ goto start_over;
+ }
+ spin_unlock(&swap_lock);
+
return false;
}
/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
- * @gfp: gfp mask for shadow nodes
*
* Allocate swap space for the folio and add the folio to the
* swap cache.
@@ -1405,7 +1418,7 @@ start_over:
* Context: Caller needs to hold the folio lock.
* Return: Whether the folio was added to the swap cache.
*/
-int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+int folio_alloc_swap(struct folio *folio)
{
unsigned int order = folio_order(folio);
unsigned int size = 1 << order;
@@ -1432,11 +1445,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
}
}
+again:
local_lock(&percpu_swap_cluster.lock);
if (!swap_alloc_fast(&entry, order))
swap_alloc_slow(&entry, order);
local_unlock(&percpu_swap_cluster.lock);
+ if (unlikely(!order && !entry.val)) {
+ if (swap_sync_discard())
+ goto again;
+ }
+
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
if (mem_cgroup_try_charge_swap(folio, entry))
goto out_free;
@@ -1677,7 +1696,7 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si,
/*
* Check if it's the last ref of swap entry in the freeing path.
- * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
+ * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
*/
static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
{
@@ -2239,7 +2258,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct folio *folio;
unsigned long offset;
unsigned char swp_count;
- swp_entry_t entry;
+ softleaf_t entry;
int ret;
pte_t ptent;
@@ -2250,11 +2269,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
ptent = ptep_get_lockless(pte);
+ entry = softleaf_from_pte(ptent);
- if (!is_swap_pte(ptent))
+ if (!softleaf_is_swap(entry))
continue;
-
- entry = pte_to_swp_entry(ptent);
if (swp_type(entry) != type)
continue;
@@ -2682,44 +2700,18 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
-static int swap_node(struct swap_info_struct *si)
-{
- struct block_device *bdev;
-
- if (si->bdev)
- bdev = si->bdev;
- else
- bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
- return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
static void setup_swap_info(struct swap_info_struct *si, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
unsigned long *zeromap)
{
- int i;
-
- if (prio >= 0)
- si->prio = prio;
- else
- si->prio = --least_priority;
+ si->prio = prio;
/*
* the plist prio is negated because plist ordering is
* low-to-high, while swap ordering is high-to-low
*/
si->list.prio = -si->prio;
- for_each_node(i) {
- if (si->prio >= 0)
- si->avail_lists[i].prio = -si->prio;
- else {
- if (swap_node(si) == i)
- si->avail_lists[i].prio = 1;
- else
- si->avail_lists[i].prio = -si->prio;
- }
- }
+ si->avail_list.prio = -si->prio;
si->swap_map = swap_map;
si->cluster_info = cluster_info;
si->zeromap = zeromap;
@@ -2731,16 +2723,7 @@ static void _enable_swap_info(struct swap_info_struct *si)
total_swap_pages += si->pages;
assert_spin_locked(&swap_lock);
- /*
- * both lists are plists, and thus priority ordered.
- * swap_active_head needs to be priority ordered for swapoff(),
- * which on removal of any swap_info_struct with an auto-assigned
- * (i.e. negative) priority increments the auto-assigned priority
- * of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
- * which allocates swap pages from the highest available priority
- * swap_info_struct.
- */
+
plist_add(&si->list, &swap_active_head);
/* Add back to available list */
@@ -2890,20 +2873,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
spin_lock(&p->lock);
del_from_avail_list(p, true);
- if (p->prio < 0) {
- struct swap_info_struct *si = p;
- int nid;
-
- plist_for_each_entry_continue(si, &swap_active_head, list) {
- si->prio++;
- si->list.prio--;
- for_each_node(nid) {
- if (si->avail_lists[nid].prio != 1)
- si->avail_lists[nid].prio--;
- }
- }
- least_priority++;
- }
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
@@ -2942,7 +2911,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !bdev_nonrot(p->bdev))
+ if (!(p->flags & SWP_SOLIDSTATE))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -3141,9 +3110,8 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
struct swap_info_struct *defer = NULL;
unsigned int type;
- int i;
- p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+ p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -3182,8 +3150,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
- for_each_node(i)
- plist_node_init(&p->avail_lists[i], 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
if (defer) {
@@ -3236,8 +3203,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
*/
unsigned long generic_max_swapfile_size(void)
{
- return swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ swp_entry_t entry = swp_entry(0, ~0UL);
+ const pte_t pte = softleaf_to_pte(entry);
+
+ /*
+ * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
+ * we need to do this manually.
+ */
+ entry = __pte_to_swp_entry(pte);
+ entry = swp_entry(__swp_type(entry), __swp_offset(entry));
+
+ return swp_offset(entry) + 1;
}
/* Can be overridden by an architecture for additional checks. */
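A worked illustration of the round-trip above; the 50-bit offset width is an assumed example, not any particular architecture:

	/*
	 * If an arch encodes the swap offset in a 50-bit PTE field, then
	 * __pte_to_swp_entry(softleaf_to_pte(swp_entry(0, ~0UL))) truncates
	 * the offset to (1UL << 50) - 1, so generic_max_swapfile_size()
	 * returns 1UL << 50 (in pages).
	 */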
@@ -3355,7 +3331,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
- goto err_free;
+ goto err;
for (i = 0; i < SWAP_NR_ORDERS; i++)
si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
spin_lock_init(&si->global_cluster_lock);
@@ -3368,7 +3344,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- err = inc_cluster_info_page(si, cluster_info, 0);
+ err = swap_cluster_setup_bad_slot(cluster_info, 0);
if (err)
goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3376,12 +3352,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
if (page_nr >= maxpages)
continue;
- err = inc_cluster_info_page(si, cluster_info, page_nr);
+ err = swap_cluster_setup_bad_slot(cluster_info, page_nr);
if (err)
goto err;
}
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
- err = inc_cluster_info_page(si, cluster_info, i);
+ err = swap_cluster_setup_bad_slot(cluster_info, i);
if (err)
goto err;
}
@@ -3408,9 +3384,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
}
return cluster_info;
-err_free:
- free_cluster_info(cluster_info, maxpages);
err:
+ free_cluster_info(cluster_info, maxpages);
return ERR_PTR(err);
}
@@ -3440,9 +3415,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!swap_avail_heads)
- return -ENOMEM;
-
si = alloc_swap_info();
if (IS_ERR(si))
return PTR_ERR(si);
@@ -3619,7 +3591,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
mutex_lock(&swapon_mutex);
- prio = -1;
+ prio = DEF_SWAP_PRIO;
if (swap_flags & SWAP_FLAG_PREFER)
prio = swap_flags & SWAP_FLAG_PRIO_MASK;
enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
@@ -4051,8 +4023,7 @@ static bool __has_usable_swap(void)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
- struct swap_info_struct *si, *next;
- int nid = folio_nid(folio);
+ struct swap_info_struct *si;
if (!(gfp & __GFP_IO))
return;
@@ -4071,8 +4042,7 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
- avail_lists[nid]) {
+ plist_for_each_entry(si, &swap_avail_head, avail_list) {
if (si->bdev) {
blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
@@ -4084,18 +4054,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
static int __init swapfile_init(void)
{
- int nid;
-
- swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
- GFP_KERNEL);
- if (!swap_avail_heads) {
- pr_emerg("Not enough memory for swap heads, swap is disabled\n");
- return -ENOMEM;
- }
-
- for_each_node(nid)
- plist_head_init(&swap_avail_heads[nid]);
-
swapfile_maximum_size = arch_max_swapfile_size();
/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af61b95c89e4..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
@@ -178,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
spinlock_t *ptl;
struct folio *folio = page_folio(page);
bool page_in_cache = folio_mapping(folio);
+ pte_t dst_ptep;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
@@ -199,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
}
ret = -EEXIST;
+
+ dst_ptep = ptep_get(dst_pte);
+
/*
- * We allow to overwrite a pte marker: consider when both MISSING|WP
- * registered, we firstly wr-protect a none pte which has no page cache
- * page backing it, then access the page.
+ * We are allowed to overwrite a UFFD pte marker: consider when both
+ * MISSING|WP are registered, we first wr-protect a none pte which has
+ * no page cache page backing it, then access the page.
*/
- if (!pte_none_mostly(ptep_get(dst_pte)))
+ if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
goto out_unlock;
if (page_in_cache) {
@@ -583,12 +587,15 @@ retry:
goto out_unlock;
}
- if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
- !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
- err = -EEXIST;
- hugetlb_vma_unlock_read(dst_vma);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- goto out_unlock;
+ if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+ if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
+ err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
}
err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
@@ -1035,8 +1042,7 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
*/
static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
unsigned long src_addr,
- pte_t *src_pte, pte_t *dst_pte,
- struct anon_vma *src_anon_vma)
+ pte_t *src_pte, pte_t *dst_pte)
{
pte_t orig_dst_pte, orig_src_pte;
struct folio *folio;
@@ -1052,8 +1058,7 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
if (!folio || !folio_trylock(folio))
return NULL;
- if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) ||
- folio_anon_vma(folio) != src_anon_vma) {
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
folio_unlock(folio);
return NULL;
}
@@ -1061,9 +1066,8 @@ static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
}
/*
- * Moves src folios to dst in a batch as long as they share the same
- * anon_vma as the first folio, are not large, and can successfully
- * take the lock via folio_trylock().
+ * Moves src folios to dst in a batch as long as they are not large and can
+ * be locked via folio_trylock().
*/
static long move_present_ptes(struct mm_struct *mm,
struct vm_area_struct *dst_vma,
@@ -1073,8 +1077,7 @@ static long move_present_ptes(struct mm_struct *mm,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
spinlock_t *dst_ptl, spinlock_t *src_ptl,
- struct folio **first_src_folio, unsigned long len,
- struct anon_vma *src_anon_vma)
+ struct folio **first_src_folio, unsigned long len)
{
int err = 0;
struct folio *src_folio = *first_src_folio;
@@ -1116,9 +1119,8 @@ static long move_present_ptes(struct mm_struct *mm,
orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
/* Set soft dirty bit so userspace can notice the pte was moved */
-#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
-#endif
+ if (pgtable_supports_soft_dirty())
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
if (pte_dirty(orig_src_pte))
orig_dst_pte = pte_mkdirty(orig_dst_pte);
orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
@@ -1132,8 +1134,8 @@ static long move_present_ptes(struct mm_struct *mm,
src_pte++;
folio_unlock(src_folio);
- src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte,
- dst_pte, src_anon_vma);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+ src_pte, dst_pte);
if (!src_folio)
break;
}
@@ -1205,9 +1207,8 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
-#ifdef CONFIG_MEM_SOFT_DIRTY
- orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
-#endif
+ if (pgtable_supports_soft_dirty())
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1253,7 +1254,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
unsigned long dst_addr, unsigned long src_addr,
unsigned long len, __u64 mode)
{
- swp_entry_t entry;
struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
@@ -1263,7 +1263,6 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
pmd_t dummy_pmdval;
pmd_t dst_pmdval;
struct folio *src_folio = NULL;
- struct anon_vma *src_anon_vma = NULL;
struct mmu_notifier_range range;
long ret = 0;
@@ -1347,9 +1346,9 @@ retry:
}
/*
- * Pin and lock both source folio and anon_vma. Since we are in
- * RCU read section, we can't block, so on contention have to
- * unmap the ptes, obtain the lock and retry.
+ * Pin and lock the source folio. Since we are in an RCU read
+ * section, we can't block, so on contention we have to unmap
+ * the ptes, obtain the lock and retry.
*/
if (!src_folio) {
struct folio *folio;
@@ -1423,46 +1422,25 @@ retry:
goto retry;
}
- if (!src_anon_vma) {
- /*
- * folio_referenced walks the anon_vma chain
- * without the folio lock. Serialize against it with
- * the anon_vma lock, the folio lock is not enough.
- */
- src_anon_vma = folio_get_anon_vma(src_folio);
- if (!src_anon_vma) {
- /* page was unmapped from under us */
- ret = -EAGAIN;
- goto out;
- }
- if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(src_pte);
- pte_unmap(dst_pte);
- src_pte = dst_pte = NULL;
- /* now we can block and wait */
- anon_vma_lock_write(src_anon_vma);
- goto retry;
- }
- }
-
ret = move_present_ptes(mm, dst_vma, src_vma,
dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, &src_folio,
- len, src_anon_vma);
- } else {
+ len);
+ } else { /* !pte_present() */
struct folio *folio = NULL;
+ const softleaf_t entry = softleaf_from_pte(orig_src_pte);
- entry = pte_to_swp_entry(orig_src_pte);
- if (non_swap_entry(entry)) {
- if (is_migration_entry(entry)) {
- pte_unmap(src_pte);
- pte_unmap(dst_pte);
- src_pte = dst_pte = NULL;
- migration_entry_wait(mm, src_pmd, src_addr);
- ret = -EAGAIN;
- } else
- ret = -EFAULT;
+ if (softleaf_is_migration(entry)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ migration_entry_wait(mm, src_pmd, src_addr);
+
+ ret = -EAGAIN;
+ goto out;
+ } else if (!softleaf_is_swap(entry)) {
+ ret = -EFAULT;
goto out;
}
@@ -1515,10 +1493,6 @@ retry:
}
out:
- if (src_anon_vma) {
- anon_vma_unlock_write(src_anon_vma);
- put_anon_vma(src_anon_vma);
- }
if (src_folio) {
folio_unlock(src_folio);
folio_put(src_folio);
@@ -1578,7 +1552,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
/*
* For now, we keep it simple and only move between writable VMAs.
- * Access flags are equal, therefore cheching only the source is enough.
+ * Access flags are equal, therefore checking only the source is enough.
*/
if (!(src_vma->vm_flags & VM_WRITE))
return -EINVAL;
@@ -1792,15 +1766,6 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
* virtual regions without knowing if there are transparent hugepage
* in the regions or not, but preventing the risk of having to split
* the hugepmd during the remap.
- *
- * If there's any rmap walk that is taking the anon_vma locks without
- * first obtaining the folio lock (the only current instance is
- * folio_referenced), they will have to verify if the folio->mapping
- * has changed after taking the anon_vma lock. If it changed they
- * should release the lock and retry obtaining a new anon_vma, because
- * it means the anon_vma was changed by move_pages() before the lock
- * could be obtained. This is the only additional complexity added to
- * the rmap code to provide this anonymous page remapping functionality.
*/
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, __u64 mode)
diff --git a/mm/util.c b/mm/util.c
index 8989d5767528..97cae40c0209 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1135,7 +1135,7 @@ EXPORT_SYMBOL(flush_dcache_folio);
#endif
/**
- * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare()
+ * __compat_vma_mmap() - See description for compat_vma_mmap()
* for details. This is the same operation, only with a specific file operations
* struct which may or may not be the same as vma->vm_file->f_op.
* @f_op: The file operations whose .mmap_prepare() hook is specified.
@@ -1143,7 +1143,7 @@ EXPORT_SYMBOL(flush_dcache_folio);
* @vma: The VMA to apply the .mmap_prepare() hook to.
* Returns: 0 on success or error.
*/
-int __compat_vma_mmap_prepare(const struct file_operations *f_op,
+int __compat_vma_mmap(const struct file_operations *f_op,
struct file *file, struct vm_area_struct *vma)
{
struct vm_area_desc desc = {
@@ -1156,21 +1156,24 @@ int __compat_vma_mmap_prepare(const struct file_operations *f_op,
.vm_file = vma->vm_file,
.vm_flags = vma->vm_flags,
.page_prot = vma->vm_page_prot,
+
+ .action.type = MMAP_NOTHING, /* Default */
};
int err;
err = f_op->mmap_prepare(&desc);
if (err)
return err;
- set_vma_from_desc(vma, &desc);
- return 0;
+ mmap_action_prepare(&desc.action, &desc);
+ set_vma_from_desc(vma, &desc);
+ return mmap_action_complete(&desc.action, vma);
}
-EXPORT_SYMBOL(__compat_vma_mmap_prepare);
+EXPORT_SYMBOL(__compat_vma_mmap);
/**
- * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
- * existing VMA.
+ * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA and execute any requested actions.
* @file: The file which possesses an f_op->mmap_prepare() hook.
* @vma: The VMA to apply the .mmap_prepare() hook to.
*
@@ -1185,7 +1188,7 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare);
* .mmap_prepare() hook, as we are in a different context when we invoke the
* .mmap() hook, already having a VMA to deal with.
*
- * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * compat_vma_mmap() is a compatibility function that takes VMA state,
* establishes a struct vm_area_desc descriptor, passes to the underlying
* .mmap_prepare() hook and applies any changes performed by it.
*
@@ -1194,11 +1197,11 @@ EXPORT_SYMBOL(__compat_vma_mmap_prepare);
*
* Returns: 0 on success or error.
*/
-int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
{
- return __compat_vma_mmap_prepare(file->f_op, file, vma);
+ return __compat_vma_mmap(file->f_op, file, vma);
}
-EXPORT_SYMBOL(compat_vma_mmap_prepare);
+EXPORT_SYMBOL(compat_vma_mmap);
static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
const struct page *page)
@@ -1280,6 +1283,127 @@ again:
}
}
+static int mmap_action_finish(struct mmap_action *action,
+ const struct vm_area_struct *vma, int err)
+{
+ /*
+ * If an error occurs, unmap the VMA altogether and return an error. This
+ * function is only invoked if we do NOT merge, so we only ever clean up
+ * the newly allocated VMA.
+ */
+ if (err) {
+ const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+ do_munmap(current->mm, vma->vm_start, len, NULL);
+
+ if (action->error_hook) {
+ /* We may want to filter the error. */
+ err = action->error_hook(err);
+
+ /* The caller should not clear the error. */
+ VM_WARN_ON_ONCE(!err);
+ }
+ return err;
+ }
+
+ if (action->success_hook)
+ return action->success_hook(vma);
+
+ return 0;
+}
+
+#ifdef CONFIG_MMU
+/**
+ * mmap_action_prepare - Perform preparatory setup for a VMA descriptor
+ * action which needs to be performed.
+ * @action: The action to perform.
+ * @desc: The VMA descriptor to prepare for @action.
+ */
+void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc)
+{
+ switch (action->type) {
+ case MMAP_NOTHING:
+ break;
+ case MMAP_REMAP_PFN:
+ remap_pfn_range_prepare(desc, action->remap.start_pfn);
+ break;
+ case MMAP_IO_REMAP_PFN:
+ io_remap_pfn_range_prepare(desc, action->remap.start_pfn,
+ action->remap.size);
+ break;
+ }
+}
+EXPORT_SYMBOL(mmap_action_prepare);
+
+/**
+ * mmap_action_complete - Execute VMA descriptor action.
+ * @action: The action to perform.
+ * @vma: The VMA to perform the action upon.
+ *
+ * Counterpart to mmap_action_prepare(), invoked once the VMA is in place.
+ *
+ * Return: 0 on success, or error, at which point the VMA will be unmapped.
+ */
+int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma)
+{
+ int err = 0;
+
+ switch (action->type) {
+ case MMAP_NOTHING:
+ break;
+ case MMAP_REMAP_PFN:
+ err = remap_pfn_range_complete(vma, action->remap.start,
+ action->remap.start_pfn, action->remap.size,
+ action->remap.pgprot);
+ break;
+ case MMAP_IO_REMAP_PFN:
+ err = io_remap_pfn_range_complete(vma, action->remap.start,
+ action->remap.start_pfn, action->remap.size,
+ action->remap.pgprot);
+ break;
+ }
+
+ return mmap_action_finish(action, vma, err);
+}
+EXPORT_SYMBOL(mmap_action_complete);
+#else
+void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc)
+{
+ switch (action->type) {
+ case MMAP_NOTHING:
+ break;
+ case MMAP_REMAP_PFN:
+ case MMAP_IO_REMAP_PFN:
+ WARN_ON_ONCE(1); /* nommu cannot handle these. */
+ break;
+ }
+}
+EXPORT_SYMBOL(mmap_action_prepare);
+
+int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma)
+{
+ int err = 0;
+
+ switch (action->type) {
+ case MMAP_NOTHING:
+ break;
+ case MMAP_REMAP_PFN:
+ case MMAP_IO_REMAP_PFN:
+ WARN_ON_ONCE(1); /* nommu cannot handle this. */
+
+ err = -EINVAL;
+ break;
+ }
+
+ return mmap_action_finish(action, vma, err);
+}
+EXPORT_SYMBOL(mmap_action_complete);
+#endif
+
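To show how these helpers are meant to be driven, a hypothetical driver .mmap_prepare() hook requesting a PFN remap via the action mechanism. The driver name and PFN base are assumptions; the action fields match those consumed by mmap_action_complete() above:

	static unsigned long mydrv_base_pfn;	/* assumed device PFN base */

	static int mydrv_mmap_prepare(struct vm_area_desc *desc)
	{
		struct mmap_action *action = &desc->action;

		action->type = MMAP_REMAP_PFN;
		action->remap.start = desc->start;
		action->remap.start_pfn = mydrv_base_pfn + desc->pgoff;
		action->remap.size = desc->end - desc->start;
		action->remap.pgprot = desc->page_prot;

		return 0;
	}

The core then calls mmap_action_prepare() before the VMA is established and mmap_action_complete() afterwards, unmapping the VMA if the latter fails.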
#ifdef CONFIG_MMU
/**
* folio_pte_batch - detect a PTE batch for a large folio
diff --git a/mm/vma.c b/mm/vma.c
index abe0da33c844..fc90befd162f 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -34,7 +34,9 @@ struct mmap_state {
struct maple_tree mt_detach;
/* Determine if we can check KSM flags early in mmap() logic. */
- bool check_ksm_early;
+ bool check_ksm_early :1;
+ /* If we map new, hold the file rmap lock on mapping. */
+ bool hold_file_rmap_lock :1;
};
#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
@@ -87,15 +89,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
if (!mpol_equal(vmg->policy, vma_policy(vma)))
return false;
- /*
- * VM_SOFTDIRTY should not prevent from VMA merging, if we
- * match the flags but dirty bit -- the caller should mark
- * merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressure on the memory system forcing
- * the kernel to generate new VMAs when old one could be
- * extended instead.
- */
- if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_IGNORE_MERGE)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -109,7 +103,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
- struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */
+ struct vm_area_struct *src = vmg->middle; /* existing merge case. */
struct anon_vma *tgt_anon = tgt->anon_vma;
struct anon_vma *src_anon = vmg->anon_vma;
@@ -481,8 +475,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
- /* mm_wr_locked = */ true);
+ unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end);
mas_set(mas, vma->vm_end);
free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING,
@@ -798,7 +791,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
* Returns: The merged VMA if merge succeeds, or NULL otherwise.
*
* ASSUMPTIONS:
- * - The caller must assign the VMA to be modifed to @vmg->middle.
+ * - The caller must assign the VMA to be modified to @vmg->middle.
* - The caller must have set @vmg->prev to the previous VMA, if there is one.
* - The caller must not set @vmg->next, as we determine this.
* - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
@@ -807,6 +800,7 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma)
static __must_check struct vm_area_struct *vma_merge_existing_range(
struct vma_merge_struct *vmg)
{
+ vm_flags_t sticky_flags = vmg->vm_flags & VM_STICKY;
struct vm_area_struct *middle = vmg->middle;
struct vm_area_struct *prev = vmg->prev;
struct vm_area_struct *next;
@@ -899,11 +893,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (merge_right) {
vma_start_write(next);
vmg->target = next;
+ sticky_flags |= (next->vm_flags & VM_STICKY);
}
if (merge_left) {
vma_start_write(prev);
vmg->target = prev;
+ sticky_flags |= (prev->vm_flags & VM_STICKY);
}
if (merge_both) {
@@ -973,6 +969,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (err || commit_merge(vmg))
goto abort;
+ vm_flags_set(vmg->target, sticky_flags);
khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
@@ -1123,6 +1120,10 @@ int vma_expand(struct vma_merge_struct *vmg)
bool remove_next = false;
struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ vm_flags_t sticky_flags;
+
+ sticky_flags = vmg->vm_flags & VM_STICKY;
+ sticky_flags |= target->vm_flags & VM_STICKY;
VM_WARN_ON_VMG(!target, vmg);
@@ -1132,6 +1133,7 @@ int vma_expand(struct vma_merge_struct *vmg)
if (next && (target != next) && (vmg->end == next->vm_end)) {
int ret;
+ sticky_flags |= next->vm_flags & VM_STICKY;
remove_next = true;
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
@@ -1158,6 +1160,7 @@ int vma_expand(struct vma_merge_struct *vmg)
if (commit_merge(vmg))
goto nomem;
+ vm_flags_set(target, sticky_flags);
return 0;
nomem:
@@ -1226,7 +1229,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
tlb_gather_mmu(&tlb, vms->vma->vm_mm);
update_hiwater_rss(vms->vma->vm_mm);
unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
- vms->vma_count, mm_wr_locked);
+ vms->vma_count);
mas_set(mas_detach, 1);
/* start and end may be different if there is no prev or next vma. */
@@ -1637,25 +1640,35 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
return vma;
}
-struct vm_area_struct *vma_modify_flags(
- struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start, unsigned long end,
- vm_flags_t vm_flags)
+struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ vm_flags_t *vm_flags_ptr)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+ const vm_flags_t vm_flags = *vm_flags_ptr;
+ struct vm_area_struct *ret;
vmg.vm_flags = vm_flags;
- return vma_modify(&vmg);
+ ret = vma_modify(&vmg);
+ if (IS_ERR(ret))
+ return ret;
+
+ /*
+ * For a merge to succeed, the flags must match those requested.
+ * However, sticky flags may have been retained, so propagate them
+ * to the caller.
+ */
+ if (vmg.state == VMA_MERGE_SUCCESS)
+ *vm_flags_ptr = ret->vm_flags;
+ return ret;
}
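A sketch of the updated calling convention; the mprotect-style caller shape and the calc_vm_flags() helper are assumptions, and vm_flags_reset() stands in for whatever flag update the real caller performs:

	vm_flags_t newflags = calc_vm_flags();
	struct vm_area_struct *merged;

	merged = vma_modify_flags(&vmi, prev, vma, start, end, &newflags);
	if (IS_ERR(merged))
		return PTR_ERR(merged);

	/* On a successful merge, newflags also carries retained sticky flags. */
	vma_start_write(merged);
	vm_flags_reset(merged, newflags);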
-struct vm_area_struct
-*vma_modify_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct anon_vma_name *new_name)
+struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
@@ -1664,12 +1677,10 @@ struct vm_area_struct
return vma_modify(&vmg);
}
-struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct mempolicy *new_pol)
+struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
@@ -1678,14 +1689,10 @@ struct vm_area_struct
return vma_modify(&vmg);
}
-struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- vm_flags_t vm_flags,
- struct vm_userfaultfd_ctx new_ctx,
- bool give_up_on_oom)
+struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
@@ -1754,24 +1761,7 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
unlink_file_vma_batch_process(vb);
}
-/*
- * Unlink a file-based vm structure from its interval tree, to hide
- * vma from rmap and vmtruncate before freeing its page tables.
- */
-void unlink_file_vma(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
-
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- i_mmap_lock_write(mapping);
- __remove_shared_vm_struct(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-}
-
-void vma_link_file(struct vm_area_struct *vma)
+static void vma_link_file(struct vm_area_struct *vma, bool hold_rmap_lock)
{
struct file *file = vma->vm_file;
struct address_space *mapping;
@@ -1780,11 +1770,12 @@ void vma_link_file(struct vm_area_struct *vma)
mapping = file->f_mapping;
i_mmap_lock_write(mapping);
__vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
+ if (!hold_rmap_lock)
+ i_mmap_unlock_write(mapping);
}
}
-int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
VMA_ITERATOR(vmi, mm, 0);
@@ -1794,7 +1785,7 @@ int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
vma_start_write(vma);
vma_iter_store_new(&vmi, vma);
- vma_link_file(vma);
+ vma_link_file(vma, /* hold_rmap_lock= */false);
mm->map_count++;
validate_mm(mm);
return 0;
@@ -1917,7 +1908,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_IGNORE_MERGE)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -2328,17 +2319,33 @@ static void update_ksm_flags(struct mmap_state *map)
map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
}
+static void set_desc_from_map(struct vm_area_desc *desc,
+ const struct mmap_state *map)
+{
+ desc->start = map->addr;
+ desc->end = map->end;
+
+ desc->pgoff = map->pgoff;
+ desc->vm_file = map->file;
+ desc->vm_flags = map->vm_flags;
+ desc->page_prot = map->page_prot;
+}
+
/*
- * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
+ * __mmap_setup() - Prepare to gather any overlapping VMAs that need to be
* unmapped once the map operation is completed, check limits, account mapping
* and clean up any pre-existing VMAs.
*
+ * As a result it sets up the @map and @desc objects.
+ *
* @map: Mapping state.
+ * @desc: VMA descriptor.
* @uf: Userfaultfd context list.
*
* Returns: 0 on success, error code otherwise.
*/
-static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
+static int __mmap_setup(struct mmap_state *map, struct vm_area_desc *desc,
+ struct list_head *uf)
{
int error;
struct vma_iterator *vmi = map->vmi;
@@ -2395,6 +2402,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
*/
vms_clean_up_area(vms, &map->mas_detach);
+ set_desc_from_map(desc, map);
return 0;
}
@@ -2496,7 +2504,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_start_write(vma);
vma_iter_store_new(vmi, vma);
map->mm->map_count++;
- vma_link_file(vma);
+ vma_link_file(vma, map->hold_file_rmap_lock);
/*
* vma_merge_new_range() calls khugepaged_enter_vma() too, the below
@@ -2551,11 +2559,23 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
- vm_flags_set(vma, VM_SOFTDIRTY);
+ if (pgtable_supports_soft_dirty())
+ vm_flags_set(vma, VM_SOFTDIRTY);
vma_set_page_prot(vma);
}
+static void call_action_prepare(struct mmap_state *map,
+ struct vm_area_desc *desc)
+{
+ struct mmap_action *action = &desc->action;
+
+ mmap_action_prepare(action, desc);
+
+ if (action->hide_from_rmap_until_complete)
+ map->hold_file_rmap_lock = true;
+}
+
/*
* Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
* specifies it.
@@ -2567,34 +2587,26 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
*
* Returns 0 on success, or an error code otherwise.
*/
-static int call_mmap_prepare(struct mmap_state *map)
+static int call_mmap_prepare(struct mmap_state *map,
+ struct vm_area_desc *desc)
{
int err;
- struct vm_area_desc desc = {
- .mm = map->mm,
- .file = map->file,
- .start = map->addr,
- .end = map->end,
-
- .pgoff = map->pgoff,
- .vm_file = map->file,
- .vm_flags = map->vm_flags,
- .page_prot = map->page_prot,
- };
/* Invoke the hook. */
- err = vfs_mmap_prepare(map->file, &desc);
+ err = vfs_mmap_prepare(map->file, desc);
if (err)
return err;
+ call_action_prepare(map, desc);
+
/* Update fields permitted to be changed. */
- map->pgoff = desc.pgoff;
- map->file = desc.vm_file;
- map->vm_flags = desc.vm_flags;
- map->page_prot = desc.page_prot;
+ map->pgoff = desc->pgoff;
+ map->file = desc->vm_file;
+ map->vm_flags = desc->vm_flags;
+ map->page_prot = desc->page_prot;
/* User-defined fields. */
- map->vm_ops = desc.vm_ops;
- map->vm_private_data = desc.private_data;
+ map->vm_ops = desc->vm_ops;
+ map->vm_private_data = desc->private_data;
return 0;
}
@@ -2636,22 +2648,48 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
return false;
}
+static int call_action_complete(struct mmap_state *map,
+ struct vm_area_desc *desc,
+ struct vm_area_struct *vma)
+{
+ struct mmap_action *action = &desc->action;
+ int ret;
+
+ ret = mmap_action_complete(action, vma);
+
+ /* If we held the file rmap we need to release it. */
+ if (map->hold_file_rmap_lock) {
+ struct file *file = vma->vm_file;
+
+ i_mmap_unlock_write(file->f_mapping);
+ }
+ return ret;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- int error;
bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ struct vm_area_desc desc = {
+ .mm = mm,
+ .file = file,
+ .action = {
+ .type = MMAP_NOTHING, /* Default to no further action. */
+ },
+ };
+ bool allocated_new = false;
+ int error;
map.check_ksm_early = can_set_ksm_flags_early(&map);
- error = __mmap_prepare(&map, uf);
+ error = __mmap_setup(&map, &desc, uf);
if (!error && have_mmap_prepare)
- error = call_mmap_prepare(&map);
+ error = call_mmap_prepare(&map, &desc);
if (error)
goto abort_munmap;
@@ -2670,6 +2708,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
error = __mmap_new_vma(&map, &vma);
if (error)
goto unacct_error;
+ allocated_new = true;
}
if (have_mmap_prepare)
@@ -2677,9 +2716,16 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
__mmap_complete(&map, vma);
+ if (have_mmap_prepare && allocated_new) {
+ error = call_action_complete(&map, &desc, vma);
+
+ if (error)
+ return error;
+ }
+
return addr;
- /* Accounting was done by __mmap_prepare(). */
+ /* Accounting was done by __mmap_setup(). */
unacct_error:
if (map.charged)
vm_unacct_memory(map.charged);
@@ -2819,7 +2865,8 @@ out:
mm->data_vm += len >> PAGE_SHIFT;
if (vm_flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
- vm_flags_set(vma, VM_SOFTDIRTY);
+ if (pgtable_supports_soft_dirty())
+ vm_flags_set(vma, VM_SOFTDIRTY);
return 0;
mas_store_fail:
diff --git a/mm/vma.h b/mm/vma.h
index 9183fe549009..abada6a64c4e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -263,47 +263,113 @@ void remove_vma(struct vm_area_struct *vma);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
-/* We are about to modify the VMA's flags. */
-__must_check struct vm_area_struct
-*vma_modify_flags(struct vma_iterator *vmi,
+/**
+ * vma_modify_flags() - Perform any necessary split/merge in preparation for
+ * setting VMA flags to *@vm_flags_ptr in the range @start to @end contained within
+ * @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags_ptr: A pointer to the VMA flags that the @start to @end range is
+ * about to be set to. On merge, this will be updated to include sticky flags.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied;
+ * rather, the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * In order to account for sticky VMA flags, the @vm_flags_ptr parameter points
+ * to the requested flags, which are updated on merge so that a caller who
+ * overwrites any existing flags correctly retains the sticky ones.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * flags altered to *@vm_flags_ptr.
+ */
+__must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ vm_flags_t *vm_flags_ptr);
+
+/**
+ * vma_modify_name() - Perform any necessary split/merge in preparation for
+ * setting anonymous VMA name to @new_name in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_name: The anonymous VMA name that the @start to @end range is about to
+ * be set to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied;
+ * rather, the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * anonymous VMA name changed to @new_name.
+ */
+__must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- vm_flags_t vm_flags);
-
-/* We are about to modify the VMA's anon_name. */
-__must_check struct vm_area_struct
-*vma_modify_name(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- struct anon_vma_name *new_name);
-
-/* We are about to modify the VMA's memory policy. */
-__must_check struct vm_area_struct
-*vma_modify_policy(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
+ struct anon_vma_name *new_name);
+
+/**
+ * vma_modify_policy() - Perform any necessary split/merge in preparation for
+ * setting NUMA policy to @new_pol in the range @start to @end contained
+ * within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @new_pol: The NUMA policy that the @start to @end range is about to be set
+ * to.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied;
+ * rather, the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its
+ * NUMA policy changed to @new_pol.
+ */
+__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct mempolicy *new_pol);
-/* We are about to modify the VMA's flags and/or uffd context. */
-__must_check struct vm_area_struct
-*vma_modify_flags_uffd(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- vm_flags_t vm_flags,
- struct vm_userfaultfd_ctx new_ctx,
- bool give_up_on_oom);
+/**
+ * vma_modify_flags_uffd() - Perform any necessary split/merge in preparation for
+ * setting VMA flags to @vm_flags and UFFD context to @new_ctx in the range
+ * @start to @end contained within @vma.
+ * @vmi: Valid VMA iterator positioned at @vma.
+ * @prev: The VMA immediately prior to @vma or NULL if @vma is the first.
+ * @vma: The VMA containing the range @start to @end to be updated.
+ * @start: The start of the range to update. May be offset within @vma.
+ * @end: The exclusive end of the range to update, may be offset within @vma.
+ * @vm_flags: The VMA flags that the @start to @end range is about to be set to.
+ * @new_ctx: The userfaultfd context that the @start to @end range is about to
+ * be set to.
+ * @give_up_on_oom: If an out of memory condition occurs on merge, simply give
+ * up on it and treat the merge as best-effort.
+ *
+ * IMPORTANT: The actual modification being requested here is NOT applied;
+ * rather, the VMA is perhaps split, perhaps merged to accommodate the change,
+ * and the caller is expected to perform the actual modification.
+ *
+ * Returns: A VMA which contains the range @start to @end ready to have its VMA
+ * flags changed to @vm_flags and its userfaultfd context changed to @new_ctx.
+ */
+__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom);
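Taken together, the expected caller pattern for these helpers is split/merge first, then apply the change; a hedged sketch with error handling abbreviated (vm_flags_reset() stands in for the caller's actual application step):

    	vm_flags_t vm_flags = new_flags;	/* may gain sticky flags on merge */

    	vma = vma_modify_flags(vmi, prev, vma, start, end, &vm_flags);
    	if (IS_ERR(vma))
    		return PTR_ERR(vma);

    	/* The helper did NOT apply the change; the caller does that now. */
    	vma_start_write(vma);
    	vm_flags_reset(vma, vm_flags);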
-__must_check struct vm_area_struct
-*vma_merge_new_range(struct vma_merge_struct *vmg);
+__must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
-__must_check struct vm_area_struct
-*vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
+__must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma, unsigned long delta);
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
@@ -312,12 +378,6 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);
void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
struct vm_area_struct *vma);
-void unlink_file_vma(struct vm_area_struct *vma);
-
-void vma_link_file(struct vm_area_struct *vma);
-
-int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);
-
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
index 922ee51747a6..8134e1afca68 100644
--- a/mm/vma_exec.c
+++ b/mm/vma_exec.c
@@ -107,6 +107,7 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
unsigned long *top_mem_p)
{
+ unsigned long flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
int err;
struct vm_area_struct *vma = vm_area_alloc(mm);
@@ -137,7 +138,9 @@ int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
vma->vm_end = STACK_TOP_MAX;
vma->vm_start = vma->vm_end - PAGE_SIZE;
- vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ if (pgtable_supports_soft_dirty())
+ flags |= VM_SOFTDIRTY;
+ vm_flags_init(vma, flags);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
err = insert_vm_struct(mm, vma);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 798b2ed21e46..ecbac900c35f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -100,6 +100,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct page *page;
unsigned long size = PAGE_SIZE;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
+ return -EINVAL;
+
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
@@ -167,6 +170,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
{
pmd_t *pmd;
unsigned long next;
+ int err = 0;
pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
@@ -180,10 +184,11 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
@@ -217,6 +222,7 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
{
pud_t *pud;
unsigned long next;
+ int err = 0;
pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
@@ -230,11 +236,11 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
- max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
@@ -268,6 +274,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
+ int err = 0;
p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
@@ -281,11 +288,11 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
- max_page_shift, mask))
- return -ENOMEM;
+ err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask);
+ if (err)
+ break;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
+ return err;
}
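The same -ENOMEM-to-err propagation change repeats at the PMD, PUD and P4D levels so that the -EINVAL from vmap_pte_range()'s new alignment check reaches the top-level caller intact; a sketch of the visible effect (the caller below is hypothetical):

    	/* A misaligned request now fails fast with -EINVAL instead of
    	 * being misreported as an allocation failure (-ENOMEM). */
    	err = vmap_range_noflush(addr, addr + size, phys_addr, prot, shift);
    	if (err == -EINVAL)
    		pr_warn("vmap: misaligned range rejected\n");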
static int vmap_range_noflush(unsigned long addr, unsigned long end,
@@ -671,16 +678,28 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
}
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
- pgprot_t prot, struct page **pages, unsigned int page_shift)
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
{
int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
- page_shift);
+ page_shift, gfp_mask);
if (ret)
return ret;
return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}
+static int __vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift,
+ gfp_t gfp_mask)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask);
+ flush_cache_vmap(addr, end);
+ return err;
+}
+
/**
* vmap_pages_range - map pages to a kernel virtual address
* @addr: start of the VM area to map
@@ -696,11 +715,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
- int err;
-
- err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
- flush_cache_vmap(addr, end);
- return err;
+ return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
}
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
@@ -2017,6 +2032,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long freed;
unsigned long addr;
unsigned int vn_id;
+ bool allow_block;
int purged = 0;
int ret;
@@ -2028,7 +2044,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
/* Only reclaim behaviour flags are relevant. */
gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
- might_sleep();
+ allow_block = gfpflags_allow_blocking(gfp_mask);
+ might_sleep_if(allow_block);
/*
* If a VA is obtained from a global heap(if it fails here)
@@ -2062,7 +2079,8 @@ retry:
* This is not a fast path. Check if yielding is needed. This
* is the only reschedule point in the vmalloc() path.
*/
- cond_resched();
+ if (allow_block)
+ cond_resched();
}
trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr));
@@ -2071,8 +2089,16 @@ retry:
* If an allocation fails, the error value is
* returned. Therefore trigger the overflow path.
*/
- if (IS_ERR_VALUE(addr))
- goto overflow;
+ if (IS_ERR_VALUE(addr)) {
+ if (allow_block)
+ goto overflow;
+
+ /*
+ * We can not trigger any reclaim logic because
+ * sleeping is not allowed, thus fail an allocation.
+ */
+ goto out_free_va;
+ }
va->va_start = addr;
va->va_end = addr + size;
@@ -2122,6 +2148,7 @@ overflow:
pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
size, vstart, vend);
+out_free_va:
kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
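The practical effect, sketched at the top-level API (the caller is hypothetical; non-blocking GFP support in vmalloc is what these hunks enable):

    	/* In a context that cannot sleep, the VA allocation above fails
    	 * fast instead of purging and retrying; the caller sees NULL. */
    	void *buf = __vmalloc(size, GFP_ATOMIC);
    	if (!buf)
    		return -ENOMEM;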
@@ -2672,8 +2699,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
node = numa_node_id();
- vb = kmalloc_node(sizeof(struct vmap_block),
- gfp_mask & GFP_RECLAIM_MASK, node);
+ vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node);
if (unlikely(!vb))
return ERR_PTR(-ENOMEM);
@@ -3587,13 +3613,58 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
+/*
+ * Helper for vmalloc to adjust the gfp flags for certain allocations.
+ */
+static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large)
+{
+ flags |= __GFP_NOWARN;
+ if (large)
+ flags &= ~__GFP_NOFAIL;
+ return flags;
+}
+
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
+ unsigned int nr_remaining = nr_pages;
+ unsigned int max_attempt_order = MAX_PAGE_ORDER;
struct page *page;
int i;
+ unsigned int large_order = ilog2(nr_remaining);
+ gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM;
+
+ large_order = min(max_attempt_order, large_order);
+
+	/*
+	 * Initially, attempt to have the page allocator give us large-order
+	 * pages. Do not attempt orders smaller than the requested order,
+	 * since __vmap_pages_range() expects physically contiguous chunks
+	 * of exactly that order.
+	 */
+ while (large_order > order && nr_remaining) {
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages_noprof(large_gfp, large_order);
+ else
+ page = alloc_pages_node_noprof(nid, large_gfp, large_order);
+
+ if (unlikely(!page)) {
+ max_attempt_order = --large_order;
+ continue;
+ }
+
+ split_page(page, large_order);
+ for (i = 0; i < (1U << large_order); i++)
+ pages[nr_allocated + i] = page + i;
+
+ nr_allocated += 1U << large_order;
+ nr_remaining = nr_pages - nr_allocated;
+
+ large_order = ilog2(nr_remaining);
+ large_order = min(max_attempt_order, large_order);
+ }
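For a concrete trace of the loop above (assuming order == 0 and MAX_PAGE_ORDER == 10): with nr_pages == 1000, large_order starts at ilog2(1000) == 9, so a 512-page allocation is attempted and, on success, split into order-0 pages; 488 pages then remain, ilog2(488) == 8 yields a 256-page attempt, and so on. A failure at a given order lowers max_attempt_order, so larger orders are never retried, and whatever remains is handed to the order-0 bulk allocator below.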
/*
* For order-0 pages we make use of bulk allocator, if
@@ -3675,6 +3746,71 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
return nr_allocated;
}
+static LLIST_HEAD(pending_vm_area_cleanup);
+static void cleanup_vm_area_work(struct work_struct *work)
+{
+ struct vm_struct *area, *tmp;
+ struct llist_node *head;
+
+ head = llist_del_all(&pending_vm_area_cleanup);
+ if (!head)
+ return;
+
+ llist_for_each_entry_safe(area, tmp, head, llnode) {
+ if (!area->pages)
+ free_vm_area(area);
+ else
+ vfree(area->addr);
+ }
+}
+
+/*
+ * Helper for __vmalloc_area_node() to defer cleanup
+ * of partially initialized vm_struct in error paths.
+ */
+static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work);
+static void defer_vm_area_cleanup(struct vm_struct *area)
+{
+ if (llist_add(&area->llnode, &pending_vm_area_cleanup))
+ schedule_work(&cleanup_vm_area);
+}
+
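A note on this pairing: llist_add() returns true only when the node lands on an empty list, so the work item is scheduled exactly once per batch. Usage mirrors the fail: path further down; a sketch under the assumption that the error path may run in a context that cannot sleep (the condition is hypothetical):

    	if (alloc_failed) {			/* hypothetical condition */
    		/* Never vfree() in place here; the worker frees it later. */
    		defer_vm_area_cleanup(area);
    		return NULL;
    	}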
+/*
+ * Page table allocations ignore the external GFP mask, so enforce
+ * it via the memalloc scope API. Used by vmalloc internals and
+ * KASAN shadow population only.
+ *
+ * GFP to scope mapping:
+ *
+ * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
+ * GFP_NOFS - memalloc_nofs_save()
+ * GFP_NOIO - memalloc_noio_save()
+ *
+ * Returns a flag cookie to pair with restore.
+ */
+unsigned int
+memalloc_apply_gfp_scope(gfp_t gfp_mask)
+{
+ unsigned int flags = 0;
+
+ if (!gfpflags_allow_blocking(gfp_mask))
+ flags = memalloc_noreclaim_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ /* 0 - no scope applied. */
+ return flags;
+}
+
+void
+memalloc_restore_scope(unsigned int flags)
+{
+ if (flags)
+ memalloc_flags_restore(flags);
+}
+
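The intended pairing, a sketch of the pattern the __vmalloc_area_node() hunk below switches to:

    	unsigned int flags;

    	flags = memalloc_apply_gfp_scope(gfp_mask); /* e.g. GFP_NOFS -> NOFS scope */
    	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift,
    				       gfp_mask);
    	memalloc_restore_scope(flags);		/* no-op if no scope applied */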
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, unsigned int page_shift,
int node)
@@ -3691,6 +3827,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
+ /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */
+ if (!gfpflags_allow_blocking(gfp_mask))
+ nofail = false;
+
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -3706,8 +3846,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
- free_vm_area(area);
- return NULL;
+ goto fail;
}
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3721,9 +3860,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* Please note, the __vmalloc_node_range_noprof() falls-back
* to order-0 pages if high-order attempt is unsuccessful.
*/
- area->nr_pages = vm_area_alloc_pages((page_order ?
- gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN,
- node, page_order, nr_small_pages, area->pages);
+ area->nr_pages = vm_area_alloc_pages(
+ vmalloc_gfp_adjust(gfp_mask, page_order), node,
+ page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
/* All pages of vm should be charged to same memcg, so use first one. */
@@ -3757,22 +3896,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* page tables allocations ignore external gfp mask, enforce it
* by the scope API
*/
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- flags = memalloc_nofs_save();
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- flags = memalloc_noio_save();
-
+ flags = memalloc_apply_gfp_scope(gfp_mask);
do {
- ret = vmap_pages_range(addr, addr + size, prot, area->pages,
- page_shift);
+ ret = __vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift, nested_gfp);
if (nofail && (ret < 0))
schedule_timeout_uninterruptible(1);
} while (nofail && (ret < 0));
-
- if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
- memalloc_nofs_restore(flags);
- else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
- memalloc_noio_restore(flags);
+ memalloc_restore_scope(flags);
if (ret < 0) {
warn_alloc(gfp_mask, NULL,
@@ -3784,10 +3915,32 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- vfree(area->addr);
+ defer_vm_area_cleanup(area);
return NULL;
}
+/*
+ * See __vmalloc_node_range() for a clear list of supported vmalloc flags.
+ * This mask covers all GFP flags currently passed to vmalloc. Currently,
+ * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm
+ * and BPF also use GFP_USER. Additionally, various users pass
+ * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP.
+ */
+#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
+ __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
+ GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
+ GFP_USER | __GFP_NOLOCKDEP)
+
+static gfp_t vmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED;
+
+ flags &= GFP_VMALLOC_SUPPORTED;
+ WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ return flags;
+}
+
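Sketched from the caller's side (hypothetical caller; __GFP_RETRY_MAYFAIL is documented in the kernel-doc below as unsupported):

    	/* __GFP_RETRY_MAYFAIL is not in GFP_VMALLOC_SUPPORTED: it is masked
    	 * off, WARN_ONCE() fires once, and the allocation proceeds. */
    	void *p = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_RETRY_MAYFAIL);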
/**
* __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
@@ -3801,19 +3954,20 @@ fail:
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Please note that the full set of gfp
- * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
- * supported.
- * Zone modifiers are not supported. From the reclaim modifiers
- * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
- * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
- * __GFP_RETRY_MAYFAIL are not supported).
+ * allocator with @gfp_mask flags and map them into contiguous
+ * virtual range with protection @prot.
+ *
+ * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT,
+ * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported.
+ * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only
+ * by __vmalloc().
*
- * __GFP_NOWARN can be used to suppress failures messages.
+ * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY
+ * and %__GFP_RETRY_MAYFAIL are not supported.
*
- * Map them into contiguous kernel virtual space, using a pagetable
- * protection of @prot.
+ * %__GFP_NOWARN can be used to suppress failure messages.
*
+ * Must not be called from interrupt or NMI context.
* Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
@@ -3946,11 +4100,8 @@ fail:
* Allocate enough pages to cover @size from the page level allocator with
* @gfp_mask flags. Map them into contiguous kernel virtual space.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
- * and __GFP_NOFAIL are not supported
- *
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
- * with mm people.
+ * Semantics of @gfp_mask (including reclaim/retry modifiers such as
+ * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof().
*
* Return: pointer to the allocated memory or %NULL on error
*/
@@ -3971,6 +4122,8 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
@@ -4010,6 +4163,8 @@ EXPORT_SYMBOL(vmalloc_noprof);
*/
void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
+ if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED))
+ gfp_mask = vmalloc_fix_flags(gfp_mask);
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
node, __builtin_return_address(0));
@@ -5055,7 +5210,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
unsigned int *counters;
if (IS_ENABLED(CONFIG_NUMA))
- counters = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
for_each_vmap_node(vn) {
spin_lock(&vn->busy.lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb4a96c7b682..900c74b6aa62 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -477,17 +477,6 @@ static int reclaimer_offset(struct scan_control *sc)
return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
-static inline int is_page_cache_freeable(struct folio *folio)
-{
- /*
- * A freeable page cache folio is referenced only by the caller
- * that isolated the folio, the page cache and optional filesystem
- * private data at folio->private.
- */
- return folio_ref_count(folio) - folio_test_private(folio) ==
- 1 + folio_nr_pages(folio);
-}
-
/*
* We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
@@ -696,24 +685,12 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
+ *
+ * A freeable shmem or swapcache folio is referenced only by the
+ * caller that isolated the folio and the page cache.
*/
- if (!is_page_cache_freeable(folio))
- return PAGE_KEEP;
- if (!mapping) {
- /*
- * Some data journaling orphaned folios can have
- * folio->mapping == NULL while being dirty with clean buffers.
- */
- if (folio_test_private(folio)) {
- if (try_to_free_buffers(folio)) {
- folio_clear_dirty(folio);
- pr_info("%s: orphaned folio\n", __func__);
- return PAGE_CLEAN;
- }
- }
+ if (folio_ref_count(folio) != 1 + folio_nr_pages(folio) || !mapping)
return PAGE_KEEP;
- }
-
if (!shmem_mapping(mapping) && !folio_test_anon(folio))
return PAGE_ACTIVATE;
if (!folio_clear_dirty_for_io(folio))
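Worked example of the simplified freeable check: an order-2 swapcache folio spans four pages, so the swap cache holds four references and the isolating caller one more; the folio is written out only when folio_ref_count() == 1 + 4 == 5. Any extra reference, such as a GUP pin or a concurrent cache lookup, fails the test and the folio is kept.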
@@ -1054,7 +1031,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
* When this happens, 'page' will likely just be discarded
* instead of migrated.
*/
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
.nmask = &allowed_mask,
@@ -1318,7 +1295,7 @@ retry:
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
- if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN)) {
+ if (folio_alloc_swap(folio)) {
int __maybe_unused order = folio_order(folio);
if (!folio_test_large(folio))
@@ -1334,7 +1311,7 @@ retry:
}
#endif
count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK);
- if (folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOWARN))
+ if (folio_alloc_swap(folio))
goto activate_locked_split;
}
/*
@@ -1409,21 +1386,7 @@ retry:
mapping = folio_mapping(folio);
if (folio_test_dirty(folio)) {
- /*
- * Only kswapd can writeback filesystem folios
- * to avoid risk of stack overflow. But avoid
- * injecting inefficient single-folio I/O into
- * flusher writeback as much as possible: only
- * write folios when we've encountered many
- * dirty folios, and when we've already scanned
- * the rest of the LRU for clean folios and see
- * the same dirty folios again (with the reclaim
- * flag set).
- */
- if (folio_is_file_lru(folio) &&
- (!current_is_kswapd() ||
- !folio_test_reclaim(folio) ||
- !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
+ if (folio_is_file_lru(folio)) {
/*
* Immediately reclaim when written back.
* Similar in principle to folio_deactivate()
@@ -1432,7 +1395,8 @@ retry:
*/
node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
nr_pages);
- folio_set_reclaim(folio);
+ if (!folio_test_reclaim(folio))
+ folio_set_reclaim(folio);
goto activate_locked;
}
@@ -2054,7 +2018,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
@@ -3773,7 +3737,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
pud = pud_offset(p4d, start & P4D_MASK);
restart:
for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
- pud_t val = READ_ONCE(pud[i]);
+ pud_t val = pudp_get(pud + i);
next = pud_addr_end(addr, end);
@@ -4780,7 +4744,7 @@ retry:
reset_batch_size(walk);
}
- __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
+ mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(sc),
stat.nr_demoted);
item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
@@ -6127,11 +6091,6 @@ again:
if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
- /* Allow kswapd to start writing pages during reclaim.*/
- if (sc->nr.unqueued_dirty &&
- sc->nr.unqueued_dirty == sc->nr.file_taken)
- set_bit(PGDAT_DIRTY, &pgdat->flags);
-
/*
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
@@ -6872,7 +6831,6 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
clear_bit(LRUVEC_NODE_CONGESTED, &lruvec->flags);
clear_bit(LRUVEC_CGROUP_CONGESTED, &lruvec->flags);
- clear_bit(PGDAT_DIRTY, &pgdat->flags);
clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
@@ -7169,7 +7127,12 @@ restart:
goto restart;
}
- if (!sc.nr_reclaimed)
+ /*
+	 * If reclaim was boosted, we might still be far from the high
+	 * watermark at this point. Avoid incrementing the failure count
+	 * in that case so the kswapd thread is not stopped prematurely.
+ */
+ if (!sc.nr_reclaimed && !boosted)
atomic_inc(&pgdat->kswapd_failures);
out:
@@ -7623,9 +7586,11 @@ static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
else
nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
- /* If we can't clean pages, remove dirty pages from consideration */
- if (!(node_reclaim_mode & RECLAIM_WRITE))
- delta += node_page_state(pgdat, NR_FILE_DIRTY);
+ /*
+ * Since we can't clean folios through reclaim, remove dirty file
+ * folios from consideration.
+ */
+ delta += node_page_state(pgdat, NR_FILE_DIRTY);
/* Watch for any possible underflows due to delta */
if (unlikely(delta > nr_pagecache_reclaimable))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bb09c032eecf..65de88cdf40e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -392,7 +392,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
delta >>= PAGE_SHIFT;
}
- /* See __mod_node_page_state */
+ /* See __mod_zone_page_state() */
preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -438,7 +438,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- /* See __mod_node_page_state */
+ /* See __mod_zone_page_state() */
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
@@ -461,7 +461,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
- /* See __mod_node_page_state */
+ /* See __mod_zone_page_state() */
preempt_disable_nested();
v = __this_cpu_inc_return(*p);
@@ -494,7 +494,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 __percpu *p = pcp->vm_stat_diff + item;
s8 v, t;
- /* See __mod_node_page_state */
+ /* See __mod_zone_page_state() */
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
@@ -517,7 +517,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
- /* See __mod_node_page_state */
+ /* See __mod_zone_page_state() */
preempt_disable_nested();
v = __this_cpu_dec_return(*p);
@@ -771,25 +771,28 @@ EXPORT_SYMBOL(dec_node_page_state);
/*
* Fold a differential into the global counters.
- * Returns the number of counters updated.
+ * Returns whether counters were updated.
*/
static int fold_diff(int *zone_diff, int *node_diff)
{
int i;
- int changes = 0;
+ bool changed = false;
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
if (zone_diff[i]) {
atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
- changes++;
+ changed = true;
+ }
}
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
if (node_diff[i]) {
atomic_long_add(node_diff[i], &vm_node_stat[i]);
- changes++;
+ changed = true;
+ }
}
- return changes;
+
+ return changed;
}
/*
@@ -806,16 +809,16 @@ static int fold_diff(int *zone_diff, int *node_diff)
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
*
- * The function returns the number of global counters updated.
+ * The function returns whether global counters were updated.
*/
-static int refresh_cpu_vm_stats(bool do_pagesets)
+static bool refresh_cpu_vm_stats(bool do_pagesets)
{
struct pglist_data *pgdat;
struct zone *zone;
int i;
int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
- int changes = 0;
+ bool changed = false;
for_each_populated_zone(zone) {
struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
@@ -839,7 +842,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
if (do_pagesets) {
cond_resched();
- changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
+ if (decay_pcp_high(zone, this_cpu_ptr(pcp)))
+ changed = true;
#ifdef CONFIG_NUMA
/*
* Deal with draining the remote pageset of this
@@ -861,13 +865,13 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
if (__this_cpu_dec_return(pcp->expire)) {
- changes++;
+ changed = true;
continue;
}
if (__this_cpu_read(pcp->count)) {
drain_zone_pages(zone, this_cpu_ptr(pcp));
- changes++;
+ changed = true;
}
#endif
}
@@ -887,8 +891,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
}
}
- changes += fold_diff(global_zone_diff, global_node_diff);
- return changes;
+ if (fold_diff(global_zone_diff, global_node_diff))
+ changed = true;
+ return changed;
}
/*
@@ -1847,9 +1852,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n node_unreclaimable: %u"
- "\n start_pfn: %lu",
+ "\n start_pfn: %lu"
+ "\n reserved_highatomic: %lu"
+ "\n free_highatomic: %lu",
atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
- zone->zone_start_pfn);
+ zone->zone_start_pfn,
+ zone->nr_reserved_highatomic,
+ zone->nr_free_highatomic);
seq_putc(m, '\n');
}
diff --git a/mm/workingset.c b/mm/workingset.c
index d32dc2e02a61..e9f05634747a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -749,7 +749,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
xa_delete_node(node, workingset_update_node);
- __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
+ mod_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM, 1);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
diff --git a/mm/zswap.c b/mm/zswap.c
index c1af782e54ec..5d0f8b13a958 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -175,7 +175,7 @@ static struct shrinker *zswap_shrinker;
* This structure contains the metadata for tracking a single compressed
* page within zswap.
*
- * swpentry - associated swap entry, the offset indexes into the red-black tree
+ * swpentry - associated swap entry, the offset indexes into the xarray
* length - the length in bytes of the compressed page data. Needed during
* decompression.
* referenced - true if the entry recently entered the zswap pool. Unset by the
@@ -879,7 +879,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* acomp instance, then get those requests done simultaneously. but in this
* case, zswap actually does store and load page by page, there is no
* existing method to send the second page before the first page is done
- * in one thread doing zwap.
+ * in one thread doing zswap.
* but in different threads running on different cpu, we have different
* acomp instance, so multiple threads can do (de)compression in parallel.
*/
@@ -894,7 +894,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* to the active LRU list in the case.
*/
if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
- dlen = PAGE_SIZE;
if (!mem_cgroup_zswap_writeback_enabled(
folio_memcg(page_folio(page)))) {
comp_ret = comp_ret ? comp_ret : -EINVAL;
@@ -1129,7 +1128,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
*
* 1. We extract the swp_entry_t to the stack, allowing
* zswap_writeback_entry() to pin the swap entry and
- * then validate the zwap entry against that swap entry's
+ * then validate the zswap entry against that swap entry's
* tree using pointer value comparison. Only when that
* is successful can the entry be dereferenced.
*