summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/cpuset.h9
-rw-r--r--include/linux/damon.h6
-rw-r--r--include/linux/fs.h25
-rw-r--r--include/linux/gfp.h8
-rw-r--r--include/linux/huge_mm.h17
-rw-r--r--include/linux/hugetlb.h15
-rw-r--r--include/linux/kexec.h5
-rw-r--r--include/linux/kexec_handover.h109
-rw-r--r--include/linux/khugepaged.h8
-rw-r--r--include/linux/maple_tree.h4
-rw-r--r--include/linux/memblock.h41
-rw-r--r--include/linux/memcontrol.h55
-rw-r--r--include/linux/memory.h10
-rw-r--r--include/linux/mempolicy.h4
-rw-r--r--include/linux/mm.h357
-rw-r--r--include/linux/mm_inline.h2
-rw-r--r--include/linux/mm_types.h36
-rw-r--r--include/linux/mmap_lock.h227
-rw-r--r--include/linux/mmzone.h88
-rw-r--r--include/linux/numa_memblks.h1
-rw-r--r--include/linux/page-flags-layout.h4
-rw-r--r--include/linux/page-flags.h21
-rw-r--r--include/linux/pagemap.h91
-rw-r--r--include/linux/percpu-defs.h2
-rw-r--r--include/linux/pgtable.h131
-rw-r--r--include/linux/ptdump.h15
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/swap.h12
-rw-r--r--include/linux/uprobes.h6
-rw-r--r--include/linux/util_macros.h3
-rw-r--r--include/linux/xarray.h24
-rw-r--r--include/linux/zpool.h4
-rw-r--r--include/linux/zsmalloc.h3
33 files changed, 840 insertions, 505 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 5466c96a33db..2ddb256187b5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -82,11 +82,11 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
-extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask);
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
- return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+ return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask);
}
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
@@ -173,6 +173,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
task_unlock(current);
}
+extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -293,6 +294,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
return false;
}
+static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ return true;
+}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 47e36e6ea203..a4011726cb3b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -145,6 +145,8 @@ enum damos_action {
*
* @DAMOS_QUOTA_USER_INPUT: User-input value.
* @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us.
+ * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node.
+ * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node.
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@@ -152,6 +154,8 @@ enum damos_action {
enum damos_quota_goal_metric {
DAMOS_QUOTA_USER_INPUT,
DAMOS_QUOTA_SOME_MEM_PSI_US,
+ DAMOS_QUOTA_NODE_MEM_USED_BP,
+ DAMOS_QUOTA_NODE_MEM_FREE_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@@ -161,6 +165,7 @@ enum damos_quota_goal_metric {
* @target_value: Target value of @metric to achieve with the tuning.
* @current_value: Current value of @metric.
* @last_psi_total: Last measured total PSI
+ * @nid: Node id.
* @list: List head for siblings.
*
* Data structure for getting the current score of the quota tuning goal. The
@@ -179,6 +184,7 @@ struct damos_quota_goal {
/* metric-dependent fields */
union {
u64 last_psi_total;
+ int nid;
};
struct list_head list;
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 27c1eb1f8b37..da86fcb11882 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2189,6 +2189,7 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
+ int (*mmap_prepare)(struct vm_area_desc *);
} __randomize_layout;
/* Supports async buffered reads */
@@ -2258,11 +2259,35 @@ struct inode_operations {
struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;
+/* Did the driver provide valid mmap hook configuration? */
+static inline bool file_has_valid_mmap_hooks(struct file *file)
+{
+ bool has_mmap = file->f_op->mmap;
+ bool has_mmap_prepare = file->f_op->mmap_prepare;
+
+ /* Hooks are mutually exclusive. */
+ if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
+ return false;
+ if (!has_mmap && !has_mmap_prepare)
+ return false;
+
+ return true;
+}
+
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
+ if (WARN_ON_ONCE(file->f_op->mmap_prepare))
+ return -EINVAL;
+
return file->f_op->mmap(file, vma);
}
+static inline int __call_mmap_prepare(struct file *file,
+ struct vm_area_desc *desc)
+{
+ return file->f_op->mmap_prepare(desc);
+}
+
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c9fa6309c903..be160e8d8bcb 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -45,13 +45,13 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
* !__GFP_DIRECT_RECLAIM -> direct claim is not allowed.
* !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
* All GFP_* flags including GFP_NOWAIT use one or both flags.
- * try_alloc_pages() is the only API that doesn't specify either flag.
+ * alloc_pages_nolock() is the only API that doesn't specify either flag.
*
* This is stronger than GFP_NOWAIT or GFP_ATOMIC because
* those are guaranteed to never block on a sleeping lock.
* Here we are enforcing that the allocation doesn't ever spin
* on any locks (i.e. only trylocks). There is no high level
- * GFP_$FOO flag for this use in try_alloc_pages() as the
+ * GFP_$FOO flag for this use in alloc_pages_nolock() as the
* regular page allocator doesn't fully support this
* allocation mode.
*/
@@ -354,8 +354,8 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
-struct page *try_alloc_pages_noprof(int nid, unsigned int order);
-#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
+struct page *alloc_pages_nolock_noprof(int nid, unsigned int order);
+#define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e893d546a49f..2f190c90192d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -395,7 +395,7 @@ static inline int split_huge_page(struct page *page)
void deferred_split_folio(struct folio *folio, bool partially_mapped);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio);
+ unsigned long address, bool freeze);
#define split_huge_pmd(__vma, __pmd, __address) \
do { \
@@ -403,12 +403,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \
|| pmd_devmap(*____pmd)) \
__split_huge_pmd(__vma, __pmd, __address, \
- false, NULL); \
+ false); \
} while (0)
-
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct folio *folio);
+ bool freeze);
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
unsigned long address);
@@ -495,15 +494,13 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);
-#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
-
static inline bool thp_migration_supported(void)
{
return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd, bool freeze, struct folio *folio);
+ pmd_t *pmd, bool freeze);
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio);
@@ -578,12 +575,12 @@ static inline void deferred_split_folio(struct folio *folio, bool partially_mapp
do { } while (0)
static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct folio *folio) {}
+ unsigned long address, bool freeze) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address, bool freeze, struct folio *folio) {}
+ unsigned long address, bool freeze) {}
static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
- bool freeze, struct folio *folio) {}
+ bool freeze) {}
static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4861a7e304bb..0598f36931de 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,7 @@
#include <linux/pgtable.h>
#include <linux/gfp.h>
#include <linux/userfaultfd_k.h>
+#include <linux/nodemask.h>
struct ctl_table;
struct user_struct;
@@ -128,12 +129,12 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
void unmap_hugepage_range(struct vm_area_struct *,
- unsigned long, unsigned long, struct page *,
- zap_flags_t);
+ unsigned long start, unsigned long end,
+ struct folio *, zap_flags_t);
void __unmap_hugepage_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags);
+ struct folio *, zap_flags_t zap_flags);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo_node(int nid);
@@ -176,6 +177,8 @@ extern struct list_head huge_boot_pages[MAX_NUMNODES];
void hugetlb_bootmem_alloc(void);
bool hugetlb_bootmem_allocated(void);
+extern nodemask_t hugetlb_bootmem_nodes;
+void hugetlb_bootmem_set_nodes(void);
/* arch callbacks */
@@ -453,7 +456,7 @@ static inline long hugetlb_change_protection(
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
+ unsigned long end, struct folio *folio,
zap_flags_t zap_flags)
{
BUG();
@@ -700,7 +703,7 @@ struct huge_bootmem_page {
bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);
-int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
+int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list);
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
void wait_for_freed_hugetlb_folios(void);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
@@ -1088,7 +1091,7 @@ static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
return NULL;
}
-static inline int isolate_or_dissolve_huge_page(struct page *page,
+static inline int isolate_or_dissolve_huge_folio(struct folio *folio,
struct list_head *list)
{
return -ENOMEM;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 53ef1b6c8712..f03ee2b40816 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -374,6 +374,11 @@ struct kimage {
bool is_ima_segment_index_set;
#endif
+ struct {
+ struct kexec_segment *scratch;
+ phys_addr_t fdt;
+ } kho;
+
/* Core ELF header buffer */
void *elf_headers;
unsigned long elf_headers_sz;
diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
new file mode 100644
index 000000000000..348844cffb13
--- /dev/null
+++ b/include/linux/kexec_handover.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_KEXEC_HANDOVER_H
+#define LINUX_KEXEC_HANDOVER_H
+
+#include <linux/types.h>
+#include <linux/errno.h>
+
+struct kho_scratch {
+ phys_addr_t addr;
+ phys_addr_t size;
+};
+
+/* KHO Notifier index */
+enum kho_event {
+ KEXEC_KHO_FINALIZE = 0,
+ KEXEC_KHO_ABORT = 1,
+};
+
+struct folio;
+struct notifier_block;
+
+#define DECLARE_KHOSER_PTR(name, type) \
+ union { \
+ phys_addr_t phys; \
+ type ptr; \
+ } name
+#define KHOSER_STORE_PTR(dest, val) \
+ ({ \
+ typeof(val) v = val; \
+ typecheck(typeof((dest).ptr), v); \
+ (dest).phys = virt_to_phys(v); \
+ })
+#define KHOSER_LOAD_PTR(src) \
+ ({ \
+ typeof(src) s = src; \
+ (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \
+ })
+
+struct kho_serialization;
+
+#ifdef CONFIG_KEXEC_HANDOVER
+bool kho_is_enabled(void);
+
+int kho_preserve_folio(struct folio *folio);
+int kho_preserve_phys(phys_addr_t phys, size_t size);
+struct folio *kho_restore_folio(phys_addr_t phys);
+int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
+
+int register_kho_notifier(struct notifier_block *nb);
+int unregister_kho_notifier(struct notifier_block *nb);
+
+void kho_memory_init(void);
+
+void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys,
+ u64 scratch_len);
+#else
+static inline bool kho_is_enabled(void)
+{
+ return false;
+}
+
+static inline int kho_preserve_folio(struct folio *folio)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct folio *kho_restore_folio(phys_addr_t phys)
+{
+ return NULL;
+}
+
+static inline int kho_add_subtree(struct kho_serialization *ser,
+ const char *name, void *fdt)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int register_kho_notifier(struct notifier_block *nb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int unregister_kho_notifier(struct notifier_block *nb)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void kho_memory_init(void)
+{
+}
+
+static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+ phys_addr_t scratch_phys, u64 scratch_len)
+{
+}
+#endif /* CONFIG_KEXEC_HANDOVER */
+
+#endif /* LINUX_KEXEC_HANDOVER_H */
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 1f46046080f5..b8d69cfbb58b 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,16 +15,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags);
extern void khugepaged_min_free_kbytes_update(void);
extern bool current_is_khugepaged(void);
-#ifdef CONFIG_SHMEM
extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd);
-#else
-static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr, bool install_pmd)
-{
- return 0;
-}
-#endif
static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index cbbcd18d4186..9ef129038224 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -463,6 +463,8 @@ struct ma_wr_state {
void __rcu **slots; /* mas->node->slots pointer */
void *entry; /* The entry to write */
void *content; /* The existing entry that is being overwritten */
+ unsigned char vacant_height; /* Height of lowest node with free space */
+ unsigned char sufficient_height;/* Height of lowest node with min sufficiency + 1 nodes */
};
#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
@@ -498,6 +500,8 @@ struct ma_wr_state {
.mas = ma_state, \
.content = NULL, \
.entry = wr_entry, \
+ .vacant_height = 0, \
+ .sufficient_height = 0 \
}
#define MA_TOPIARY(name, tree) \
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ef5a1ecc6e59..bb19a2534224 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,14 @@ extern unsigned long long max_possible_pfn;
* kernel resource tree.
* @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
* not initialized (only for reserved regions).
+ * @MEMBLOCK_RSRV_KERN: memory region that is reserved for kernel use,
+ * either explictitly with memblock_reserve_kern() or via memblock
+ * allocation APIs. All memblock allocations set this flag.
+ * @MEMBLOCK_KHO_SCRATCH: memory region that kexec can pass to the next
+ * kernel in handover mode. During early boot, we do not know about all
+ * memory reservations yet, so we get scratch memory from the previous
+ * kernel that we know is good to use. It is the only memory that
+ * allocations may happen from in this phase.
*/
enum memblock_flags {
MEMBLOCK_NONE = 0x0, /* No special request */
@@ -50,6 +58,8 @@ enum memblock_flags {
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */
MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */
+ MEMBLOCK_RSRV_KERN = 0x20, /* memory reserved for kernel use */
+ MEMBLOCK_KHO_SCRATCH = 0x40, /* scratch memory for kexec handover */
};
/**
@@ -116,7 +126,19 @@ int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid,
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_phys_free(phys_addr_t base, phys_addr_t size);
-int memblock_reserve(phys_addr_t base, phys_addr_t size);
+int __memblock_reserve(phys_addr_t base, phys_addr_t size, int nid,
+ enum memblock_flags flags);
+
+static __always_inline int memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ return __memblock_reserve(base, size, NUMA_NO_NODE, 0);
+}
+
+static __always_inline int memblock_reserve_kern(phys_addr_t base, phys_addr_t size)
+{
+ return __memblock_reserve(base, size, NUMA_NO_NODE, MEMBLOCK_RSRV_KERN);
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
#endif
@@ -132,6 +154,8 @@ int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
+int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size);
+int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size);
void memblock_free(void *ptr, size_t size);
void reset_all_zones_managed_pages(void);
@@ -275,6 +299,11 @@ static inline bool memblock_is_driver_managed(struct memblock_region *m)
return m->flags & MEMBLOCK_DRIVER_MANAGED;
}
+static inline bool memblock_is_kho_scratch(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_KHO_SCRATCH;
+}
+
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
@@ -476,6 +505,7 @@ static inline __init_memblock bool memblock_bottom_up(void)
phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_reserved_size(void);
+phys_addr_t memblock_reserved_kern_size(phys_addr_t limit, int nid);
unsigned long memblock_estimated_nr_free_pages(void);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
@@ -602,5 +632,14 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) { }
static inline void memtest_report_meminfo(struct seq_file *m) { }
#endif
+#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH
+void memblock_set_kho_scratch_only(void);
+void memblock_clear_kho_scratch_only(void);
+void memmap_init_kho_scratch_pages(void);
+#else
+static inline void memblock_set_kho_scratch_only(void) { }
+static inline void memblock_clear_kho_scratch_only(void) { }
+static inline void memmap_init_kho_scratch_pages(void) {}
+#endif
#endif /* _LINUX_MEMBLOCK_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 53364526d877..f7848f73f41c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -903,19 +903,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
-void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
- int val);
-
/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void mod_memcg_state(struct mem_cgroup *memcg,
- enum memcg_stat_item idx, int val)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_memcg_state(memcg, idx, val);
- local_irq_restore(flags);
-}
+void mod_memcg_state(struct mem_cgroup *memcg,
+ enum memcg_stat_item idx, int val);
static inline void mod_memcg_page_state(struct page *page,
enum memcg_stat_item idx, int val)
@@ -952,19 +942,8 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
local_irq_restore(flags);
}
-void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
- unsigned long count);
-
-static inline void count_memcg_events(struct mem_cgroup *memcg,
- enum vm_event_item idx,
- unsigned long count)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __count_memcg_events(memcg, idx, count);
- local_irq_restore(flags);
-}
+void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count);
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
@@ -1057,6 +1036,7 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
return id;
}
+extern int mem_cgroup_init(void);
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1374,12 +1354,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}
-static inline void __mod_memcg_state(struct mem_cgroup *memcg,
- enum memcg_stat_item idx,
- int nr)
-{
-}
-
static inline void mod_memcg_state(struct mem_cgroup *memcg,
enum memcg_stat_item idx,
int nr)
@@ -1433,12 +1407,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
}
static inline void count_memcg_events(struct mem_cgroup *memcg,
- enum vm_event_item idx,
- unsigned long count)
-{
-}
-
-static inline void __count_memcg_events(struct mem_cgroup *memcg,
enum vm_event_item idx,
unsigned long count)
{
@@ -1472,6 +1440,8 @@ static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
return 0;
}
+
+static inline int mem_cgroup_init(void) { return 0; }
#endif /* CONFIG_MEMCG */
/*
@@ -1736,6 +1706,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
rcu_read_unlock();
}
+bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+
#else
static inline bool mem_cgroup_kmem_disabled(void)
{
@@ -1793,6 +1765,15 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
{
}
+static inline ino_t page_cgroup_ino(struct page *page)
+{
+ return 0;
+}
+
+static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
+{
+ return true;
+}
#endif /* CONFIG_MEMCG */
#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 12daa6ec7d09..5ec4e6d209b9 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -149,6 +149,14 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
{
return 0;
}
+static inline int memory_block_advise_max_size(unsigned long size)
+{
+ return -ENODEV;
+}
+static inline unsigned long memory_block_advised_max_size(void)
+{
+ return 0;
+}
#else /* CONFIG_MEMORY_HOTPLUG */
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
@@ -181,6 +189,8 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
void memory_block_add_nid(struct memory_block *mem, int nid,
enum meminit_context context);
#endif /* CONFIG_NUMA */
+int memory_block_advise_max_size(unsigned long size);
+unsigned long memory_block_advised_max_size(void);
#endif /* CONFIG_MEMORY_HOTPLUG */
/*
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ce9885e0178a..0fe96f3ab3ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
+#include <linux/node.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>
@@ -178,6 +179,9 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
+extern int mempolicy_set_node_perf(unsigned int node,
+ struct access_coordinate *coords);
+
#else
struct mempolicy {};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e51dba8398f7..9e221ffcb868 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -12,6 +12,7 @@
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
+#include <linux/compiler.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
@@ -356,9 +357,7 @@ extern unsigned int kobjsize(const void *objp);
# define VM_SHADOW_STACK VM_NONE
#endif
-#if defined(CONFIG_X86)
-# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
-#elif defined(CONFIG_PPC64)
+#if defined(CONFIG_PPC64)
# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP VM_ARCH_1
@@ -670,204 +669,11 @@ static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_PER_VMA_LOCK
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- static struct lock_class_key lockdep_key;
-
- lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
-#endif
- if (reset_refcnt)
- refcount_set(&vma->vm_refcnt, 0);
- vma->vm_lock_seq = UINT_MAX;
-}
-
-static inline bool is_vma_writer_only(int refcnt)
-{
- /*
- * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
- * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
- * a detached vma happens only in vma_mark_detached() and is a rare
- * case, therefore most of the time there will be no unnecessary wakeup.
- */
- return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
-}
-
-static inline void vma_refcount_put(struct vm_area_struct *vma)
-{
- /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
- struct mm_struct *mm = vma->vm_mm;
- int oldcnt;
-
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
- if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
-
- if (is_vma_writer_only(oldcnt - 1))
- rcuwait_wake_up(&mm->vma_writer_wait);
- }
-}
-
-/*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- */
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- int oldcnt;
-
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
- return NULL;
-
- /*
- * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
- * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
- * Acquire fence is required here to avoid reordering against later
- * vm_lock_seq check and checks inside lock_vma_under_rcu().
- */
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT))) {
- /* return EAGAIN if vma got detached from under us */
- return oldcnt ? NULL : ERR_PTR(-EAGAIN);
- }
-
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
- /*
- * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
- vma_refcount_put(vma);
- return NULL;
- }
-
- return vma;
-}
-
/*
- * Use only while holding mmap read lock which guarantees that locking will not
- * fail (nobody can concurrently write-lock the vma). vma_start_read() should
- * not be used in such cases because it might fail due to mm_lock_seq overflow.
- * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ * These must be here rather than mmap_lock.h as dependent on vm_fault type,
+ * declared in this header.
*/
-static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
-{
- int oldcnt;
-
- mmap_assert_locked(vma->vm_mm);
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VMA_REF_LIMIT)))
- return false;
-
- rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
- return true;
-}
-
-/*
- * Use only while holding mmap read lock which guarantees that locking will not
- * fail (nobody can concurrently write-lock the vma). vma_start_read() should
- * not be used in such cases because it might fail due to mm_lock_seq overflow.
- * This functionality is used to obtain vma read lock and drop the mmap read lock.
- */
-static inline bool vma_start_read_locked(struct vm_area_struct *vma)
-{
- return vma_start_read_locked_nested(vma, 0);
-}
-
-static inline void vma_end_read(struct vm_area_struct *vma)
-{
- vma_refcount_put(vma);
-}
-
-/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
-static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
-{
- mmap_assert_write_locked(vma->vm_mm);
-
- /*
- * current task is holding mmap_write_lock, both vma->vm_lock_seq and
- * mm->mm_lock_seq can't be concurrently modified.
- */
- *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
- return (vma->vm_lock_seq == *mm_lock_seq);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
-
-/*
- * Begin writing to a VMA.
- * Exclude concurrent readers under the per-VMA lock until the currently
- * write-locked mmap_lock is dropped or downgraded.
- */
-static inline void vma_start_write(struct vm_area_struct *vma)
-{
- unsigned int mm_lock_seq;
-
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return;
-
- __vma_start_write(vma, mm_lock_seq);
-}
-
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
-{
- unsigned int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
- unsigned int mm_lock_seq;
-
- VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
- !__is_vma_write_locked(vma, &mm_lock_seq), vma);
-}
-
-/*
- * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
- * assertions should be made either under mmap_write_lock or when the object
- * has been isolated under mmap_write_lock, ensuring no competing writers.
- */
-static inline void vma_assert_attached(struct vm_area_struct *vma)
-{
- WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_assert_detached(struct vm_area_struct *vma)
-{
- WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
-}
-
-static inline void vma_mark_attached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_detached(vma);
- refcount_set_release(&vma->vm_refcnt, 1);
-}
-
-void vma_mark_detached(struct vm_area_struct *vma);
-
+#ifdef CONFIG_PER_VMA_LOCK
static inline void release_fault_lock(struct vm_fault *vmf)
{
if (vmf->flags & FAULT_FLAG_VMA_LOCK)
@@ -883,36 +689,7 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
else
mmap_assert_locked(vmf->vma->vm_mm);
}
-
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address);
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
-static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
- { return NULL; }
-static inline void vma_end_read(struct vm_area_struct *vma) {}
-static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
- { mmap_assert_write_locked(vma->vm_mm); }
-static inline void vma_assert_attached(struct vm_area_struct *vma) {}
-static inline void vma_assert_detached(struct vm_area_struct *vma) {}
-static inline void vma_mark_attached(struct vm_area_struct *vma) {}
-static inline void vma_mark_detached(struct vm_area_struct *vma) {}
-
-static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- return NULL;
-}
-
-static inline void vma_assert_locked(struct vm_area_struct *vma)
-{
- mmap_assert_locked(vma->vm_mm);
-}
-
+#else
static inline void release_fault_lock(struct vm_fault *vmf)
{
mmap_read_unlock(vmf->vma->vm_mm);
@@ -922,7 +699,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf)
{
mmap_assert_locked(vmf->vma->vm_mm);
}
-
#endif /* CONFIG_PER_VMA_LOCK */
extern const struct vm_operations_struct vma_dummy_vm_ops;
@@ -1459,7 +1235,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);
@@ -2004,6 +1780,45 @@ static inline struct folio *pfn_folio(unsigned long pfn)
return page_folio(pfn_to_page(pfn));
}
+#ifdef CONFIG_MMU
+static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
+{
+ return pfn_pte(page_to_pfn(page), pgprot);
+}
+
+/**
+ * folio_mk_pte - Create a PTE for this folio
+ * @folio: The folio to create a PTE for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_ptes().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot)
+{
+ return pfn_pte(folio_pfn(folio), pgprot);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/**
+ * folio_mk_pmd - Create a PMD for this folio
+ * @folio: The folio to create a PMD for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_pmd_at().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
+{
+ return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
+}
+#endif
+#endif /* CONFIG_MMU */
+
static inline bool folio_has_pincount(const struct folio *folio)
{
if (IS_ENABLED(CONFIG_64BIT))
@@ -2185,15 +2000,6 @@ static inline long compound_nr(struct page *page)
}
/**
- * thp_nr_pages - The number of regular pages in this huge page.
- * @page: The head page of a huge page.
- */
-static inline long thp_nr_pages(struct page *page)
-{
- return folio_nr_pages((struct folio *)page);
-}
-
-/**
* folio_next - Move to the next physical folio.
* @folio: The folio we're currently operating on.
*
@@ -2303,7 +2109,62 @@ static inline bool folio_maybe_mapped_shared(struct folio *folio)
*/
if (mapcount <= 1)
return false;
- return folio_test_large_maybe_mapped_shared(folio);
+ return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
+}
+
+/**
+ * folio_expected_ref_count - calculate the expected folio refcount
+ * @folio: the folio
+ *
+ * Calculate the expected folio refcount, taking references from the pagecache,
+ * swapcache, PG_private and page table mappings into account. Useful in
+ * combination with folio_ref_count() to detect unexpected references (e.g.,
+ * GUP or other temporary references).
+ *
+ * Does currently not consider references from the LRU cache. If the folio
+ * was isolated from the LRU (which is the case during migration or split),
+ * the LRU cache does not apply.
+ *
+ * Calling this function on an unmapped folio -- !folio_mapped() -- that is
+ * locked will return a stable result.
+ *
+ * Calling this function on a mapped folio will not result in a stable result,
+ * because nothing stops additional page table mappings from coming (e.g.,
+ * fork()) or going (e.g., munmap()).
+ *
+ * Calling this function without the folio lock will also not result in a
+ * stable result: for example, the folio might get dropped from the swapcache
+ * concurrently.
+ *
+ * However, even when called without the folio lock or on a mapped folio,
+ * this function can be used to detect unexpected references early (for example,
+ * if it makes sense to even lock the folio and unmap it).
+ *
+ * The caller must add any reference (e.g., from folio_try_get()) it might be
+ * holding itself to the result.
+ *
+ * Returns the expected folio refcount.
+ */
+static inline int folio_expected_ref_count(const struct folio *folio)
+{
+ const int order = folio_order(folio);
+ int ref_count = 0;
+
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return 0;
+
+ if (folio_test_anon(folio)) {
+ /* One reference per page from the swapcache. */
+ ref_count += folio_test_swapcache(folio) << order;
+ } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
+ /* One reference per page from the pagecache. */
+ ref_count += !!folio->mapping << order;
+ /* One reference from PG_private. */
+ ref_count += folio_test_private(folio);
+ }
+
+ /* One reference per page table mapping. */
+ return ref_count + folio_mapcount(folio);
}
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
@@ -2406,7 +2267,6 @@ static inline void clear_page_pfmemalloc(struct page *page)
extern void pagefault_out_of_memory(void);
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
-#define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1))
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
@@ -2767,7 +2627,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm)
{
unsigned long _rss = get_mm_rss(mm);
- if ((mm)->hiwater_rss < _rss)
+ if (data_race(mm->hiwater_rss) < _rss)
(mm)->hiwater_rss = _rss;
}
@@ -3117,9 +2977,10 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
pagetable_free(ptdesc);
}
-static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+static inline bool pagetable_pte_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
{
- if (!ptlock_init(ptdesc))
+ if (mm != &init_mm && !ptlock_init(ptdesc))
return false;
__pagetable_ctor(ptdesc);
return true;
@@ -3223,9 +3084,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
+static inline bool pagetable_pmd_ctor(struct mm_struct *mm,
+ struct ptdesc *ptdesc)
{
- if (!pmd_ptlock_init(ptdesc))
+ if (mm != &init_mm && !pmd_ptlock_init(ptdesc))
return false;
ptdesc_pmd_pts_init(ptdesc);
__pagetable_ctor(ptdesc);
@@ -3414,7 +3276,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, bool write);
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f9157a0c42a5..89b518ff097e 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -447,6 +447,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
#endif /* CONFIG_ANON_VMA_NAME */
+void pfnmap_track_ctx_release(struct kref *ref);
+
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 32ba5126e221..d3cffd8828c9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -28,7 +28,6 @@
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
-#define INIT_PASID 0
struct address_space;
struct futex_private_hash;
@@ -765,6 +764,38 @@ struct vma_numab_state {
int prev_scan_seq;
};
+#ifdef __HAVE_PFNMAP_TRACKING
+struct pfnmap_track_ctx {
+ struct kref kref;
+ unsigned long pfn;
+ unsigned long size; /* in bytes */
+};
+#endif
+
+/*
+ * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
+ * manipulate mutable fields which will cause those fields to be updated in the
+ * resultant VMA.
+ *
+ * Helper functions are not required for manipulating any field.
+ */
+struct vm_area_desc {
+ /* Immutable state. */
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+
+ /* Mutable fields. Populated with initial state. */
+ pgoff_t pgoff;
+ struct file *file;
+ vm_flags_t vm_flags;
+ pgprot_t page_prot;
+
+ /* Write-only fields. */
+ const struct vm_operations_struct *vm_ops;
+ void *private_data;
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -878,6 +909,9 @@ struct vm_area_struct {
struct anon_vma_name *anon_name;
#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef __HAVE_PFNMAP_TRACKING
+ struct pfnmap_track_ctx *pfnmap_track_ctx;
+#endif
} __randomize_layout;
#ifdef CONFIG_NUMA
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e0eddfd306ef..5da384bd0a26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -1,6 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H
+/* Avoid a dependency loop by declaring here. */
+extern int rcuwait_wake_up(struct rcuwait *w);
+
#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
@@ -105,6 +109,206 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ static struct lock_class_key lockdep_key;
+
+ lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
+#endif
+ if (reset_refcnt)
+ refcount_set(&vma->vm_refcnt, 0);
+ vma->vm_lock_seq = UINT_MAX;
+}
+
+static inline bool is_vma_writer_only(int refcnt)
+{
+ /*
+ * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
+ * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
+ * a detached vma happens only in vma_mark_detached() and is a rare
+ * case, therefore most of the time there will be no unnecessary wakeup.
+ */
+ return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
+}
+
+static inline void vma_refcount_put(struct vm_area_struct *vma)
+{
+ /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+ struct mm_struct *mm = vma->vm_mm;
+ int oldcnt;
+
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+ if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {
+
+ if (is_vma_writer_only(oldcnt - 1))
+ rcuwait_wake_up(&mm->vma_writer_wait);
+ }
+}
+
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ int oldcnt;
+
+ /*
+ * Check before locking. A race might cause false locked result.
+ * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+ * ACQUIRE semantics, because this is just a lockless check whose result
+ * we don't rely on for anything - the mm_lock_seq read against which we
+ * need ordering is below.
+ */
+ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
+ return NULL;
+
+ /*
+ * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+ * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+ * Acquire fence is required here to avoid reordering against later
+ * vm_lock_seq check and checks inside lock_vma_under_rcu().
+ */
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT))) {
+ /* return EAGAIN if vma got detached from under us */
+ return oldcnt ? NULL : ERR_PTR(-EAGAIN);
+ }
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+ /*
+ * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ *
+ * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+ * racing with vma_end_write_all(), we only start reading from the VMA
+ * after it has been unlocked.
+ * This pairs with RELEASE semantics in vma_end_write_all().
+ */
+ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+ vma_refcount_put(vma);
+ return NULL;
+ }
+
+ return vma;
+}
+
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
+{
+ int oldcnt;
+
+ mmap_assert_locked(vma->vm_mm);
+ if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+ VMA_REF_LIMIT)))
+ return false;
+
+ rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+ return true;
+}
+
+/*
+ * Use only while holding mmap read lock which guarantees that locking will not
+ * fail (nobody can concurrently write-lock the vma). vma_start_read() should
+ * not be used in such cases because it might fail due to mm_lock_seq overflow.
+ * This functionality is used to obtain vma read lock and drop the mmap read lock.
+ */
+static inline bool vma_start_read_locked(struct vm_area_struct *vma)
+{
+ return vma_start_read_locked_nested(vma, 0);
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+ vma_refcount_put(vma);
+}
+
+/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
+static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
+ return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);
+
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ unsigned int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return;
+
+ __vma_start_write(vma, mm_lock_seq);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ unsigned int mm_lock_seq;
+
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ unsigned int mm_lock_seq;
+
+ VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
+ !__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+/*
+ * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
+ * assertions should be made either under mmap_write_lock or when the object
+ * has been isolated under mmap_write_lock, ensuring no competing writers.
+ */
+static inline void vma_assert_attached(struct vm_area_struct *vma)
+{
+ WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_assert_detached(struct vm_area_struct *vma)
+{
+ WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
+}
+
+static inline void vma_mark_attached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_detached(vma);
+ refcount_set_release(&vma->vm_refcnt, 1);
+}
+
+void vma_mark_detached(struct vm_area_struct *vma);
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address);
+
#else /* CONFIG_PER_VMA_LOCK */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
@@ -120,6 +324,29 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int
{
return true;
}
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+ { return NULL; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+ { mmap_assert_write_locked(vma->vm_mm); }
+static inline void vma_assert_attached(struct vm_area_struct *vma) {}
+static inline void vma_assert_detached(struct vm_area_struct *vma) {}
+static inline void vma_mark_attached(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma) {}
+
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ return NULL;
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_locked(vma->vm_mm);
+}
#endif /* CONFIG_PER_VMA_LOCK */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b1c459f7a485..28066b4ced81 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2074,11 +2074,37 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
return usage ? test_bit(idx, usage->subsection_map) : 0;
}
+
+static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
+{
+ struct mem_section_usage *usage = READ_ONCE(ms->usage);
+ int idx = subsection_map_index(*pfn);
+ unsigned long bit;
+
+ if (!usage)
+ return false;
+
+ if (test_bit(idx, usage->subsection_map))
+ return true;
+
+ /* Find the next subsection that exists */
+ bit = find_next_bit(usage->subsection_map, SUBSECTIONS_PER_SECTION, idx);
+ if (bit == SUBSECTIONS_PER_SECTION)
+ return false;
+
+ *pfn = (*pfn & PAGE_SECTION_MASK) + (bit * PAGES_PER_SUBSECTION);
+ return true;
+}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
return 1;
}
+
+static inline bool pfn_section_first_valid(struct mem_section *ms, unsigned long *pfn)
+{
+ return true;
+}
#endif
void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
@@ -2127,6 +2153,58 @@ static inline int pfn_valid(unsigned long pfn)
return ret;
}
+
+/* Returns end_pfn or higher if no valid PFN remaining in range */
+static inline unsigned long first_valid_pfn(unsigned long pfn, unsigned long end_pfn)
+{
+ unsigned long nr = pfn_to_section_nr(pfn);
+
+ rcu_read_lock_sched();
+
+ while (nr <= __highest_present_section_nr && pfn < end_pfn) {
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ if (valid_section(ms) &&
+ (early_section(ms) || pfn_section_first_valid(ms, &pfn))) {
+ rcu_read_unlock_sched();
+ return pfn;
+ }
+
+ /* Nothing left in this section? Skip to next section */
+ nr++;
+ pfn = section_nr_to_pfn(nr);
+ }
+
+ rcu_read_unlock_sched();
+ return end_pfn;
+}
+
+static inline unsigned long next_valid_pfn(unsigned long pfn, unsigned long end_pfn)
+{
+ pfn++;
+
+ if (pfn >= end_pfn)
+ return end_pfn;
+
+ /*
+ * Either every PFN within the section (or subsection for VMEMMAP) is
+ * valid, or none of them are. So there's no point repeating the check
+ * for every PFN; only call first_valid_pfn() again when crossing a
+ * (sub)section boundary (i.e. !(pfn & ~PAGE_{SUB,}SECTION_MASK)).
+ */
+ if (pfn & ~(IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) ?
+ PAGE_SUBSECTION_MASK : PAGE_SECTION_MASK))
+ return pfn;
+
+ return first_valid_pfn(pfn, end_pfn);
+}
+
+
+#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \
+ for ((_pfn) = first_valid_pfn((_start_pfn), (_end_pfn)); \
+ (_pfn) < (_end_pfn); \
+ (_pfn) = next_valid_pfn((_pfn), (_end_pfn)))
+
#endif
static inline int pfn_in_present_section(unsigned long pfn)
@@ -2176,6 +2254,16 @@ void sparse_init(void);
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */
+/*
+ * Fallback case for when the architecture provides its own pfn_valid() but
+ * not a corresponding for_each_valid_pfn().
+ */
+#ifndef for_each_valid_pfn
+#define for_each_valid_pfn(_pfn, _start_pfn, _end_pfn) \
+ for ((_pfn) = (_start_pfn); (_pfn) < (_end_pfn); (_pfn)++) \
+ if (pfn_valid(_pfn))
+#endif
+
#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */
diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h
index dd85613cdd86..991076cba7c5 100644
--- a/include/linux/numa_memblks.h
+++ b/include/linux/numa_memblks.h
@@ -22,6 +22,7 @@ struct numa_meminfo {
};
int __init numa_add_memblk(int nodeid, u64 start, u64 end);
+int __init numa_add_reserved_memblk(int nid, u64 start, u64 end);
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index 4f5c9e979bb9..760006b1c480 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -72,8 +72,10 @@
#define NODE_NOT_IN_PAGE_FLAGS 1
#endif
-#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+#if defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_TAG_WIDTH 8
+#elif defined(CONFIG_KASAN_HW_TAGS)
+#define KASAN_TAG_WIDTH 4
#else
#define KASAN_TAG_WIDTH 0
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3b814ce08331..4fe5ee67535b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -915,20 +915,6 @@ FOLIO_FLAG_FALSE(partially_mapped)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * PageHuge() only returns true for hugetlbfs pages, but not for
- * normal or transparent huge pages.
- *
- * PageTransHuge() returns true for both transparent huge and
- * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
- * called only in the core VM paths where hugetlbfs pages can't exist.
- */
-static inline int PageTransHuge(const struct page *page)
-{
- VM_BUG_ON_PAGE(PageTail(page), page);
- return PageHead(page);
-}
-
-/*
* PageTransCompound returns true for both transparent huge pages
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
@@ -938,7 +924,6 @@ static inline int PageTransCompound(const struct page *page)
return PageCompound(page);
}
#else
-TESTPAGEFLAG_FALSE(TransHuge, transhuge)
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
#endif
@@ -989,7 +974,7 @@ static inline bool page_mapcount_is_type(unsigned int mapcount)
static inline bool page_has_type(const struct page *page)
{
- return page_mapcount_is_type(data_race(page->page_type));
+ return page_type_has_type(data_race(page->page_type));
}
#define FOLIO_TYPE_OPS(lname, fname) \
@@ -1237,10 +1222,6 @@ static inline int folio_has_private(const struct folio *folio)
return !!(folio->flags & PAGE_FLAGS_PRIVATE);
}
-static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio)
-{
- return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
-}
#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 26baa78f1ca7..d2ced9920992 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -533,7 +533,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
}
struct address_space *folio_mapping(struct folio *);
-struct address_space *swapcache_mapping(struct folio *);
/**
* folio_flush_mapping - Find the file mapping this folio belongs to.
@@ -884,26 +883,6 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
-extern pgoff_t __folio_swap_cache_index(struct folio *folio);
-
-/**
- * folio_index - File index of a folio.
- * @folio: The folio.
- *
- * For a folio which is either in the page cache or the swap cache,
- * return its index within the address_space it belongs to. If you know
- * the page is definitely in the page cache, you can look at the folio's
- * index directly.
- *
- * Return: The index (offset in units of pages) of a folio in its file.
- */
-static inline pgoff_t folio_index(struct folio *folio)
-{
- if (unlikely(folio_test_swapcache(folio)))
- return __folio_swap_cache_index(folio);
- return folio->index;
-}
-
/**
* folio_next_index - Get the index of the next folio.
* @folio: The current folio.
@@ -935,27 +914,14 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
* @folio: The folio.
* @index: The page index within the file.
*
- * Context: The caller should have the page locked in order to prevent
- * (eg) shmem from moving the page between the page cache and swap cache
- * and changing its index in the middle of the operation.
+ * Context: The caller should have the folio locked and ensure
+ * e.g., shmem did not move this folio to the swap cache.
* Return: true or false.
*/
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
- return index - folio_index(folio) < folio_nr_pages(folio);
-}
-
-/*
- * Given the page we found in the page cache, return the page corresponding
- * to this index in the file
- */
-static inline struct page *find_subpage(struct page *head, pgoff_t index)
-{
- /* HugeTLBfs wants the head page regardless */
- if (PageHuge(head))
- return head;
-
- return head + (index & (thp_nr_pages(head) - 1));
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ return index - folio->index < folio_nr_pages(folio);
}
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
@@ -1308,9 +1274,9 @@ static inline bool filemap_range_needs_writeback(struct address_space *mapping,
* struct readahead_control - Describes a readahead request.
*
* A readahead request is for consecutive pages. Filesystems which
- * implement the ->readahead method should call readahead_page() or
- * readahead_page_batch() in a loop and attempt to start I/O against
- * each page in the request.
+ * implement the ->readahead method should call readahead_folio() or
+ * __readahead_batch() in a loop and attempt to start reads into each
+ * folio in the request.
*
* Most of the fields in this struct are private and should be accessed
* by the functions below.
@@ -1416,22 +1382,6 @@ static inline struct folio *__readahead_folio(struct readahead_control *ractl)
}
/**
- * readahead_page - Get the next page to read.
- * @ractl: The current readahead request.
- *
- * Context: The page is locked and has an elevated refcount. The caller
- * should decreases the refcount once the page has been submitted for I/O
- * and unlock the page once all I/O to that page has completed.
- * Return: A pointer to the next page, or %NULL if we are done.
- */
-static inline struct page *readahead_page(struct readahead_control *ractl)
-{
- struct folio *folio = __readahead_folio(ractl);
-
- return &folio->page;
-}
-
-/**
* readahead_folio - Get the next folio to read.
* @ractl: The current readahead request.
*
@@ -1453,7 +1403,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
{
unsigned int i = 0;
XA_STATE(xas, &rac->mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
BUG_ON(rac->_batch_count > rac->_nr_pages);
rac->_nr_pages -= rac->_batch_count;
@@ -1462,13 +1412,12 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
xas_set(&xas, rac->_index);
rcu_read_lock();
- xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) {
+ if (xas_retry(&xas, folio))
continue;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
- array[i++] = page;
- rac->_batch_count += thp_nr_pages(page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ array[i++] = folio_page(folio, 0);
+ rac->_batch_count += folio_nr_pages(folio);
if (i == array_sz)
break;
}
@@ -1478,20 +1427,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
}
/**
- * readahead_page_batch - Get a batch of pages to read.
- * @rac: The current readahead request.
- * @array: An array of pointers to struct page.
- *
- * Context: The pages are locked and have an elevated refcount. The caller
- * should decreases the refcount once the page has been submitted for I/O
- * and unlock the page once all I/O to that page has completed.
- * Return: The number of pages placed in the array. 0 indicates the request
- * is complete.
- */
-#define readahead_page_batch(rac, array) \
- __readahead_batch(rac, array, ARRAY_SIZE(array))
-
-/**
* readahead_pos - The byte offset into the file of this readahead request.
* @rac: The readahead request.
*/
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 0aeb0e276a3e..c16cdeaa505e 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -375,7 +375,7 @@ do { \
} while (0)
/*
- * this_cpu operations (C) 2008-2013 Christoph Lameter <cl@linux.com>
+ * this_cpu operations (C) 2008-2013 Christoph Lameter <cl@gentwo.org>
*
* Optimized manipulation for memory allocated through the per cpu
* allocator or for addresses of per cpu variables.
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index b50447ef1c92..0b6e1f781d86 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1164,10 +1164,6 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
}
#endif
-#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
-#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr)
-#endif
-
#ifndef __HAVE_ARCH_MOVE_PTE
#define move_pte(pte, old_addr, new_addr) (pte)
#endif
@@ -1489,83 +1485,92 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
* vmf_insert_pfn.
*/
-/*
- * track_pfn_remap is called when a _new_ pfn mapping is being established
- * by remap_pfn_range() for physical range indicated by pfn and size.
- */
-static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long addr,
- unsigned long size)
+static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
+ pgprot_t *prot)
{
return 0;
}
-/*
- * track_pfn_insert is called when a _new_ single pfn is established
- * by vmf_insert_pfn().
- */
-static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
- pfn_t pfn)
+static inline int pfnmap_track(unsigned long pfn, unsigned long size,
+ pgprot_t *prot)
{
+ return 0;
}
-/*
- * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
- * tables copied during copy_page_range(). Will store the pfn to be
- * passed to untrack_pfn_copy() only if there is something to be untracked.
- * Callers should initialize the pfn to 0.
- */
-static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, unsigned long *pfn)
+static inline void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
- return 0;
}
+#else
+/**
+ * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range
+ * @pfn: the start of the pfn range
+ * @size: the size of the pfn range in bytes
+ * @prot: the pgprot to modify
+ *
+ * Lookup the cachemode for the pfn range starting at @pfn with the size
+ * @size and store it in @prot, leaving other data in @prot unchanged.
+ *
+ * This allows for a hardware implementation to have fine-grained control of
+ * memory cache behavior at page level granularity. Without a hardware
+ * implementation, this function does nothing.
+ *
+ * Currently there is only one implementation for this - x86 Page Attribute
+ * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
+ *
+ * This function can fail if the pfn range spans pfns that require differing
+ * cachemodes. If the pfn range was previously verified to have a single
+ * cachemode, it is sufficient to query only a single pfn. The assumption is
+ * that this is the case for drivers using the vmf_insert_pfn*() interface.
+ *
+ * Returns 0 on success and -EINVAL on error.
+ */
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size,
+ pgprot_t *prot);
-/*
- * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
- * copy_page_range(), but after track_pfn_copy() was already called. Can
- * be called even if track_pfn_copy() did not actually track anything:
- * handled internally.
+/**
+ * pfnmap_track - track a pfn range
+ * @pfn: the start of the pfn range
+ * @size: the size of the pfn range in bytes
+ * @prot: the pgprot to track
+ *
+ * Requested the pfn range to be 'tracked' by a hardware implementation and
+ * setup the cachemode in @prot similar to pfnmap_setup_cachemode().
+ *
+ * This allows for fine-grained control of memory cache behaviour at page
+ * level granularity. Tracking memory this way is persisted across VMA splits
+ * (VMA merging does not apply for VM_PFNMAP).
+ *
+ * Currently, there is only one implementation for this - x86 Page Attribute
+ * Table (PAT). See Documentation/arch/x86/pat.rst for more details.
+ *
+ * Returns 0 on success and -EINVAL on error.
*/
-static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
- unsigned long pfn)
-{
-}
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot);
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
+/**
+ * pfnmap_untrack - untrack a pfn range
+ * @pfn: the start of the pfn range
+ * @size: the size of the pfn range in bytes
+ *
+ * Untrack a pfn range previously tracked through pfnmap_track().
*/
-static inline void untrack_pfn(struct vm_area_struct *vma,
- unsigned long pfn, unsigned long size,
- bool mm_wr_locked)
-{
-}
+void pfnmap_untrack(unsigned long pfn, unsigned long size);
+#endif
-/*
- * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
+/**
+ * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn
+ * @pfn: the pfn
+ * @prot: the pgprot to modify
+ *
+ * Lookup the cachemode for @pfn and store it in @prot, leaving other
+ * data in @prot unchanged.
*
- * 1) During mremap() on the src VMA after the page tables were moved.
- * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
+ * See pfnmap_setup_cachemode() for details.
*/
-static inline void untrack_pfn_clear(struct vm_area_struct *vma)
+static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
{
+ pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
}
-#else
-extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long addr,
- unsigned long size);
-extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
- pfn_t pfn);
-extern int track_pfn_copy(struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, unsigned long *pfn);
-extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
- unsigned long pfn);
-extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size, bool mm_wr_locked);
-extern void untrack_pfn_clear(struct vm_area_struct *vma);
-#endif
#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 8dbd51ea8626..240bd3bff18d 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -11,10 +11,17 @@ struct ptdump_range {
};
struct ptdump_state {
- /* level is 0:PGD to 4:PTE, or -1 if unknown */
- void (*note_page)(struct ptdump_state *st, unsigned long addr,
- int level, u64 val);
- void (*effective_prot)(struct ptdump_state *st, int level, u64 val);
+ void (*note_page_pte)(struct ptdump_state *st, unsigned long addr, pte_t pte);
+ void (*note_page_pmd)(struct ptdump_state *st, unsigned long addr, pmd_t pmd);
+ void (*note_page_pud)(struct ptdump_state *st, unsigned long addr, pud_t pud);
+ void (*note_page_p4d)(struct ptdump_state *st, unsigned long addr, p4d_t p4d);
+ void (*note_page_pgd)(struct ptdump_state *st, unsigned long addr, pgd_t pgd);
+ void (*note_page_flush)(struct ptdump_state *st);
+ void (*effective_prot_pte)(struct ptdump_state *st, pte_t pte);
+ void (*effective_prot_pmd)(struct ptdump_state *st, pmd_t pmd);
+ void (*effective_prot_pud)(struct ptdump_state *st, pud_t pud);
+ void (*effective_prot_p4d)(struct ptdump_state *st, p4d_t p4d);
+ void (*effective_prot_pgd)(struct ptdump_state *st, pgd_t pgd);
const struct ptdump_range *range;
};
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6b82b618846e..c4f4903b1088 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -223,7 +223,7 @@ static inline void __folio_large_mapcount_sanity_checks(const struct folio *foli
VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
folio->_mm_id_mapcount[1] < 0);
VM_WARN_ON_ONCE(!folio_mapped(folio) &&
- folio_test_large_maybe_mapped_shared(folio));
+ test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}
static __always_inline void folio_set_large_mapcount(struct folio *folio,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index db46b25a65ae..bc0e1c275fc0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -414,6 +414,10 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200
+
+/* Just recliam from anon folios in proactive memory reclaim */
+#define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1)
+
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
@@ -450,7 +454,7 @@ static inline unsigned long total_swapcache_pages(void)
}
void free_swap_cache(struct folio *folio);
-void free_page_and_swap_cache(struct page *);
+void free_folio_and_swap_cache(struct folio *folio);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
@@ -520,10 +524,8 @@ static inline void put_swap_device(struct swap_info_struct *si)
#define si_swapinfo(val) \
do { (val)->freeswap = (val)->totalswap = 0; } while (0)
-/* only sparc can not include linux/pagemap.h in this file
- * so leave put_page and release_pages undeclared... */
-#define free_page_and_swap_cache(page) \
- put_page(page)
+#define free_folio_and_swap_cache(folio) \
+ folio_put(folio)
#define free_pages_and_swap_cache(pages, nr) \
release_pages((pages), (nr));
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 2e46b69ff0a6..516217c39094 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -188,13 +188,13 @@ struct uprobes_state {
};
extern void __init uprobes_init(void);
-extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
-extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
+extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
+extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
extern bool is_swbp_insn(uprobe_opcode_t *insn);
extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
-extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
+extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h
index 3b570b765b75..76ca2b83c13e 100644
--- a/include/linux/util_macros.h
+++ b/include/linux/util_macros.h
@@ -2,7 +2,10 @@
#ifndef _LINUX_HELPER_MACROS_H_
#define _LINUX_HELPER_MACROS_H_
+#include <linux/compiler_attributes.h>
#include <linux/math.h>
+#include <linux/typecheck.h>
+#include <linux/stddef.h>
/**
* for_each_if - helper for handling conditionals in various for_each macros
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 78eede109b1a..be850174e802 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -965,10 +965,12 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
* Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
* in xa_init_flags().
*
+ * Note that callers interested in whether wrapping has occurred should
+ * use __xa_alloc_cyclic() instead.
+ *
* Context: Any context. Takes and releases the xa_lock. May sleep if
* the @gfp flags permit.
- * Return: 0 if the allocation succeeded without wrapping. 1 if the
- * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
@@ -981,7 +983,7 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock(xa);
- return err;
+ return err < 0 ? err : 0;
}
/**
@@ -1002,10 +1004,12 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
* Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
* in xa_init_flags().
*
+ * Note that callers interested in whether wrapping has occurred should
+ * use __xa_alloc_cyclic() instead.
+ *
* Context: Any context. Takes and releases the xa_lock while
* disabling softirqs. May sleep if the @gfp flags permit.
- * Return: 0 if the allocation succeeded without wrapping. 1 if the
- * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
@@ -1018,7 +1022,7 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock_bh(xa);
- return err;
+ return err < 0 ? err : 0;
}
/**
@@ -1039,10 +1043,12 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
* Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
* in xa_init_flags().
*
+ * Note that callers interested in whether wrapping has occurred should
+ * use __xa_alloc_cyclic() instead.
+ *
* Context: Process context. Takes and releases the xa_lock while
* disabling interrupts. May sleep if the @gfp flags permit.
- * Return: 0 if the allocation succeeded without wrapping. 1 if the
- * allocation succeeded after wrapping, -ENOMEM if memory could not be
+ * Return: 0 if the allocation succeeded, -ENOMEM if memory could not be
* allocated or -EBUSY if there are no free entries in @limit.
*/
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
@@ -1055,7 +1061,7 @@ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
xa_unlock_irq(xa);
- return err;
+ return err < 0 ? err : 0;
}
/**
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 52f30e526607..369ef068fad8 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -22,7 +22,7 @@ const char *zpool_get_type(struct zpool *pool);
void zpool_destroy_pool(struct zpool *pool);
int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
- unsigned long *handle);
+ unsigned long *handle, const int nid);
void zpool_free(struct zpool *pool, unsigned long handle);
@@ -64,7 +64,7 @@ struct zpool_driver {
void (*destroy)(void *pool);
int (*malloc)(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle);
+ unsigned long *handle, const int nid);
void (*free)(void *pool, unsigned long handle);
void *(*obj_read_begin)(void *pool, unsigned long handle,
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index c26baf9fb331..13e9cc5490f7 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -26,7 +26,8 @@ struct zs_pool;
struct zs_pool *zs_create_pool(const char *name);
void zs_destroy_pool(struct zs_pool *pool);
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags,
+ const int nid);
void zs_free(struct zs_pool *pool, unsigned long obj);
size_t zs_huge_class_size(struct zs_pool *pool);