Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/buildid.h          |   2
-rw-r--r--  include/linux/cacheinfo.h        |   6
-rw-r--r--  include/linux/cma.h              |   6
-rw-r--r--  include/linux/crash_core.h       | 152
-rw-r--r--  include/linux/crash_reserve.h    |  48
-rw-r--r--  include/linux/damon.h            |  89
-rw-r--r--  include/linux/dax.h              |  17
-rw-r--r--  include/linux/efi.h              |   5
-rw-r--r--  include/linux/gfp.h              |   2
-rw-r--r--  include/linux/gfp_types.h        |  90
-rw-r--r--  include/linux/highmem.h          |  14
-rw-r--r--  include/linux/huge_mm.h          |  21
-rw-r--r--  include/linux/hugetlb.h          |   2
-rw-r--r--  include/linux/kexec.h            |  47
-rw-r--r--  include/linux/list_lru.h         |  20
-rw-r--r--  include/linux/memcontrol.h       |  31
-rw-r--r--  include/linux/memory.h           |   9
-rw-r--r--  include/linux/memory_hotplug.h   |  24
-rw-r--r--  include/linux/memremap.h         |   3
-rw-r--r--  include/linux/mm.h               |  56
-rw-r--r--  include/linux/mm_types.h         |  42
-rw-r--r--  include/linux/mmdebug.h          |   2
-rw-r--r--  include/linux/mmu_context.h      |   2
-rw-r--r--  include/linux/mmzone.h           |   9
-rw-r--r--  include/linux/moduleloader.h     |   8
-rw-r--r--  include/linux/padata.h           |  14
-rw-r--r--  include/linux/page-flags.h       | 153
-rw-r--r--  include/linux/page_counter.h     |   2
-rw-r--r--  include/linux/page_owner.h       |  14
-rw-r--r--  include/linux/pagevec.h          |  18
-rw-r--r--  include/linux/pgtable.h          | 143
-rw-r--r--  include/linux/ptdump.h           |  10
-rw-r--r--  include/linux/sched.h            |  10
-rw-r--r--  include/linux/sched/mm.h         |  55
-rw-r--r--  include/linux/stackdepot.h       |  58
-rw-r--r--  include/linux/swap.h             |   8
-rw-r--r--  include/linux/swapops.h          |  13
-rw-r--r--  include/linux/userfaultfd_k.h    |  75
-rw-r--r--  include/linux/vmalloc.h          |   1
-rw-r--r--  include/linux/vmcore_info.h      |  81
-rw-r--r--  include/linux/writeback.h        |  12
-rw-r--r--  include/linux/zswap.h            |  11
42 files changed, 976 insertions, 409 deletions
diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 8a582d242f06..20aa3c2d89f7 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -11,7 +11,7 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
__u32 *size);
int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
-#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_CRASH_CORE)
+#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
extern unsigned char vmlinux_build_id[BUILD_ID_SIZE_MAX];
void init_vmlinux_build_id(void);
#else
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index d504eb4b49ab..2cb15fe4fe12 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -138,4 +138,10 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level)
#define use_arch_cache_info() (false)
#endif
+#ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING
+#define cpu_dcache_is_aliasing() false
+#else
+#include <asm/cachetype.h>
+#endif
+
#endif /* _LINUX_CACHEINFO_H */
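A minimal sketch of the new helper's intended use (the caller below is hypothetical, not from the patch): code that must avoid creating distinct cache aliases of the same memory can key off cpu_dcache_is_aliasing(), which compiles away to false on architectures without CONFIG_ARCH_HAS_CPU_CACHE_ALIASING.

#include <linux/cacheinfo.h>

/* Hypothetical caller: refuse a mode that is unsafe on aliasing D-caches. */
static bool my_can_use_direct_map(void)
{
	return !cpu_dcache_is_aliasing();
}
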
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 63873b93deaa..9db877506ea8 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -6,12 +6,8 @@
#include <linux/types.h>
#include <linux/numa.h>
-/*
- * There is always at least global CMA area and a few optional
- * areas configured in kernel .config.
- */
#ifdef CONFIG_CMA_AREAS
-#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS)
+#define MAX_CMA_AREAS CONFIG_CMA_AREAS
#endif
#define CMA_MAX_NAME 64
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 9eaeaafe0cad..23270b16e1db 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -5,118 +5,47 @@
#include <linux/linkage.h>
#include <linux/elfcore.h>
#include <linux/elf.h>
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#include <asm/crash_core.h>
-#endif
-/* Location of a reserved region to hold the crash kernel.
- */
-extern struct resource crashk_res;
-extern struct resource crashk_low_res;
+struct kimage;
+
+#ifdef CONFIG_CRASH_DUMP
-#define CRASH_CORE_NOTE_NAME "CORE"
-#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
-#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
-#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
+int crash_shrink_memory(unsigned long new_size);
+ssize_t crash_get_memory_size(void);
+#ifndef arch_kexec_protect_crashkres
/*
- * The per-cpu notes area is a list of notes terminated by a "NULL"
- * note header. For kdump, the code in vmcore.c runs in the context
- * of the second kernel to combine them into one note.
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this
*/
-#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
- CRASH_CORE_NOTE_NAME_BYTES + \
- CRASH_CORE_NOTE_DESC_BYTES)
-
-#define VMCOREINFO_BYTES PAGE_SIZE
-#define VMCOREINFO_NOTE_NAME "VMCOREINFO"
-#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
-#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
- VMCOREINFO_NOTE_NAME_BYTES + \
- VMCOREINFO_BYTES)
-
-typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
-/* Per cpu memory for storing cpu states in case of system crash. */
-extern note_buf_t __percpu *crash_notes;
-
-void crash_update_vmcoreinfo_safecopy(void *ptr);
-void crash_save_vmcoreinfo(void);
-void arch_crash_save_vmcoreinfo(void);
-__printf(1, 2)
-void vmcoreinfo_append_str(const char *fmt, ...);
-phys_addr_t paddr_vmcoreinfo_note(void);
-
-#define VMCOREINFO_OSRELEASE(value) \
- vmcoreinfo_append_str("OSRELEASE=%s\n", value)
-#define VMCOREINFO_BUILD_ID() \
- ({ \
- static_assert(sizeof(vmlinux_build_id) == 20); \
- vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \
- })
-
-#define VMCOREINFO_PAGESIZE(value) \
- vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
-#define VMCOREINFO_SYMBOL(name) \
- vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
-#define VMCOREINFO_SYMBOL_ARRAY(name) \
- vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)
-#define VMCOREINFO_SIZE(name) \
- vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
- (unsigned long)sizeof(name))
-#define VMCOREINFO_STRUCT_SIZE(name) \
- vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
- (unsigned long)sizeof(struct name))
-#define VMCOREINFO_OFFSET(name, field) \
- vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
- (unsigned long)offsetof(struct name, field))
-#define VMCOREINFO_TYPE_OFFSET(name, field) \
- vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
- (unsigned long)offsetof(name, field))
-#define VMCOREINFO_LENGTH(name, value) \
- vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
-#define VMCOREINFO_NUMBER(name) \
- vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
-#define VMCOREINFO_CONFIG(name) \
- vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
-
-extern unsigned char *vmcoreinfo_data;
-extern size_t vmcoreinfo_size;
-extern u32 *vmcoreinfo_note;
-
-Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
- void *data, size_t data_len);
-void final_note(Elf_Word *buf);
-
-int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
- unsigned long long *crash_size, unsigned long long *crash_base,
- unsigned long long *low_size, bool *high);
-
-#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
-#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
-#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+static inline void arch_kexec_protect_crashkres(void) { }
#endif
-#ifndef CRASH_ALIGN
-#define CRASH_ALIGN SZ_2M
+
+#ifndef arch_kexec_unprotect_crashkres
+static inline void arch_kexec_unprotect_crashkres(void) { }
#endif
-#ifndef CRASH_ADDR_LOW_MAX
-#define CRASH_ADDR_LOW_MAX SZ_4G
+
+
+
+#ifndef arch_crash_handle_hotplug_event
+static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
#endif
-#ifndef CRASH_ADDR_HIGH_MAX
-#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM()
+
+int crash_check_update_elfcorehdr(void);
+
+#ifndef crash_hotplug_cpu_support
+static inline int crash_hotplug_cpu_support(void) { return 0; }
+#endif
+
+#ifndef crash_hotplug_memory_support
+static inline int crash_hotplug_memory_support(void) { return 0; }
#endif
-void __init reserve_crashkernel_generic(char *cmdline,
- unsigned long long crash_size,
- unsigned long long crash_base,
- unsigned long long crash_low_size,
- bool high);
-#else
-static inline void __init reserve_crashkernel_generic(char *cmdline,
- unsigned long long crash_size,
- unsigned long long crash_base,
- unsigned long long crash_low_size,
- bool high)
-{}
+#ifndef crash_get_elfcorehdr_size
+static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
#endif
/* Alignment required for elf header segment */
@@ -144,4 +73,23 @@ struct kexec_segment;
#define KEXEC_CRASH_HP_REMOVE_MEMORY 4
#define KEXEC_CRASH_HP_INVALID_CPU -1U
+extern void __crash_kexec(struct pt_regs *regs);
+extern void crash_kexec(struct pt_regs *regs);
+int kexec_should_crash(struct task_struct *p);
+int kexec_crash_loaded(void);
+void crash_save_cpu(struct pt_regs *regs, int cpu);
+extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
+
+#else /* !CONFIG_CRASH_DUMP*/
+struct pt_regs;
+struct task_struct;
+struct kimage;
+static inline void __crash_kexec(struct pt_regs *regs) { }
+static inline void crash_kexec(struct pt_regs *regs) { }
+static inline int kexec_should_crash(struct task_struct *p) { return 0; }
+static inline int kexec_crash_loaded(void) { return 0; }
+static inline void crash_save_cpu(struct pt_regs *regs, int cpu) {};
+static inline int kimage_crash_copy_vmcoreinfo(struct kimage *image) { return 0; };
+#endif /* CONFIG_CRASH_DUMP*/
+
#endif /* LINUX_CRASH_CORE_H */
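With the CONFIG_CRASH_DUMP stubs above, callers no longer need #ifdef guards around the crash-kexec entry points. A minimal sketch (the handler name is hypothetical, not from the patch):

#include <linux/crash_core.h>

/* Both calls become no-op stubs when CONFIG_CRASH_DUMP=n. */
static void my_fatal_error_handler(struct pt_regs *regs)
{
	if (kexec_crash_loaded())
		crash_kexec(regs);
}
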
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
new file mode 100644
index 000000000000..5a9df944fb80
--- /dev/null
+++ b/include/linux/crash_reserve.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_CRASH_RESERVE_H
+#define LINUX_CRASH_RESERVE_H
+
+#include <linux/linkage.h>
+#include <linux/elfcore.h>
+#include <linux/elf.h>
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#include <asm/crash_reserve.h>
+#endif
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+extern struct resource crashk_low_res;
+
+int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
+ unsigned long long *crash_size, unsigned long long *crash_base,
+ unsigned long long *low_size, bool *high);
+
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
+#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
+#endif
+#ifndef CRASH_ALIGN
+#define CRASH_ALIGN SZ_2M
+#endif
+#ifndef CRASH_ADDR_LOW_MAX
+#define CRASH_ADDR_LOW_MAX SZ_4G
+#endif
+#ifndef CRASH_ADDR_HIGH_MAX
+#define CRASH_ADDR_HIGH_MAX memblock_end_of_DRAM()
+#endif
+
+void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high);
+#else
+static inline void __init reserve_crashkernel_generic(char *cmdline,
+ unsigned long long crash_size,
+ unsigned long long crash_base,
+ unsigned long long crash_low_size,
+ bool high)
+{}
+#endif
+#endif /* LINUX_CRASH_RESERVE_H */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index 5881e4ac30be..886d07294f4e 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -127,18 +127,60 @@ enum damos_action {
};
/**
+ * enum damos_quota_goal_metric - Represents the metric to be used as the goal
+ *
+ * @DAMOS_QUOTA_USER_INPUT: User-input value.
+ * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us.
+ * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
+ *
+ * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
+ */
+enum damos_quota_goal_metric {
+ DAMOS_QUOTA_USER_INPUT,
+ DAMOS_QUOTA_SOME_MEM_PSI_US,
+ NR_DAMOS_QUOTA_GOAL_METRICS,
+};
+
+/**
+ * struct damos_quota_goal - DAMOS scheme quota auto-tuning goal.
+ * @metric: Metric to be used for representing the goal.
+ * @target_value: Target value of @metric to achieve with the tuning.
+ * @current_value: Current value of @metric.
+ * @last_psi_total: Last measured total PSI
+ * @list: List head for siblings.
+ *
+ * Data structure for getting the current score of the quota tuning goal. The
+ * score is calculated by how close @current_value and @target_value are. Then
+ * the score is entered to DAMON's internal feedback loop mechanism to get the
+ * auto-tuned quota.
+ *
+ * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually
+ * entered by the user, probably inside the kdamond callbacks. Otherwise,
+ * DAMON sets @current_value with self-measured value of @metric.
+ */
+struct damos_quota_goal {
+ enum damos_quota_goal_metric metric;
+ unsigned long target_value;
+ unsigned long current_value;
+ /* metric-dependent fields */
+ union {
+ u64 last_psi_total;
+ };
+ struct list_head list;
+};
+
+/**
* struct damos_quota - Controls the aggressiveness of the given scheme.
+ * @reset_interval: Charge reset interval in milliseconds.
* @ms: Maximum milliseconds that the scheme can use.
* @sz: Maximum bytes of memory that the action can be applied.
- * @reset_interval: Charge reset interval in milliseconds.
+ * @goals: Head of quota tuning goals (&damos_quota_goal) list.
+ * @esz: Effective size quota in bytes.
*
* @weight_sz: Weight of the region's size for prioritization.
* @weight_nr_accesses: Weight of the region's nr_accesses for prioritization.
* @weight_age: Weight of the region's age for prioritization.
*
- * @get_score: Feedback function for self-tuning quota.
- * @get_score_arg: Parameter for @get_score
- *
* To avoid consuming too much CPU time or IO resources for applying the
* &struct damos->action to large memory, DAMON allows users to set time and/or
* size quotas. The quotas can be set by writing non-zero values to &ms and
@@ -151,42 +193,35 @@ enum damos_action {
* throughput of the scheme's action. DAMON then compares it against &sz and
* uses smaller one as the effective quota.
*
+ * If @goals is not empty, DAMON calculates yet another size quota based on the
+ * goals using its internal feedback loop algorithm, for every @reset_interval.
+ * Then, if the new size quota is smaller than the effective quota, it uses the
+ * new size quota as the effective quota.
+ *
+ * The resulting effective size quota in bytes is set to @esz.
+ *
* For selecting regions within the quota, DAMON prioritizes current scheme's
* target memory regions using the &struct damon_operations->get_scheme_score.
* You could customize the prioritization logic by setting &weight_sz,
* &weight_nr_accesses, and &weight_age, because monitoring operations are
* encouraged to respect those.
- *
- * If @get_score function pointer is set, DAMON calls it back with
- * @get_score_arg and get the return value of it for every @reset_interval.
- * Then, DAMON adjusts the effective quota using the return value as a feedback
- * score to the current quota, using its internal feedback loop algorithm.
- *
- * The feedback loop algorithem assumes the quota input and the feedback score
- * output are in a positive proportional relationship, and the goal of the
- * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are
- * set together, those work as a hard limit quota. If neither @ms nor @sz are
- * set, the mechanism starts from the quota of one byte.
*/
struct damos_quota {
+ unsigned long reset_interval;
unsigned long ms;
unsigned long sz;
- unsigned long reset_interval;
+ struct list_head goals;
+ unsigned long esz;
unsigned int weight_sz;
unsigned int weight_nr_accesses;
unsigned int weight_age;
- unsigned long (*get_score)(void *arg);
- void *get_score_arg;
-
/* private: */
/* For throughput estimation */
unsigned long total_charged_sz;
unsigned long total_charged_ns;
- unsigned long esz; /* Effective size quota in bytes */
-
/* For charging the quota */
unsigned long charged_sz;
unsigned long charged_from;
@@ -640,6 +675,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damon_for_each_scheme_safe(s, next, ctx) \
list_for_each_entry_safe(s, next, &(ctx)->schemes, list)
+#define damos_for_each_quota_goal(goal, quota) \
+ list_for_each_entry(goal, &quota->goals, list)
+
+#define damos_for_each_quota_goal_safe(goal, next, quota) \
+ list_for_each_entry_safe(goal, next, &(quota)->goals, list)
+
#define damos_for_each_filter(f, scheme) \
list_for_each_entry(f, &(scheme)->filters, list)
@@ -673,6 +714,12 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
void damos_add_filter(struct damos *s, struct damos_filter *f);
void damos_destroy_filter(struct damos_filter *f);
+struct damos_quota_goal *damos_new_quota_goal(
+ enum damos_quota_goal_metric metric,
+ unsigned long target_value);
+void damos_add_quota_goal(struct damos_quota *q, struct damos_quota_goal *g);
+void damos_destroy_quota_goal(struct damos_quota_goal *goal);
+
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
enum damos_action action,
unsigned long apply_interval_us,
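A minimal sketch (not from the patch; my_set_psi_goal is hypothetical and the NULL-on-failure check is an assumption) of attaching a PSI-based auto-tuning goal to a scheme quota with the new helpers:

#include <linux/damon.h>
#include <linux/errno.h>

static int my_set_psi_goal(struct damos_quota *quota)
{
	struct damos_quota_goal *goal;

	/* Aim for roughly 10ms of system-level "some" memory PSI. */
	goal = damos_new_quota_goal(DAMOS_QUOTA_SOME_MEM_PSI_US, 10000);
	if (!goal)
		return -ENOMEM;
	damos_add_quota_goal(quota, goal);
	return 0;
}
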
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b463502b16e1..9d3e3327af4c 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -63,6 +63,8 @@ void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
+void set_dax_nocache(struct dax_device *dax_dev);
+void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i);
@@ -86,11 +88,7 @@ static inline void *dax_holder(struct dax_device *dax_dev)
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
- /*
- * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
- * NULL is an error or expected.
- */
- return NULL;
+ return ERR_PTR(-EOPNOTSUPP);
}
static inline void put_dax(struct dax_device *dax_dev)
{
@@ -109,6 +107,12 @@ static inline bool dax_synchronous(struct dax_device *dax_dev)
{
return true;
}
+static inline void set_dax_nocache(struct dax_device *dax_dev)
+{
+}
+static inline void set_dax_nomc(struct dax_device *dax_dev)
+{
+}
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
@@ -124,9 +128,6 @@ static inline size_t dax_recovery_write(struct dax_device *dax_dev,
}
#endif
-void set_dax_nocache(struct dax_device *dax_dev);
-void set_dax_nomc(struct dax_device *dax_dev);
-
struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
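The stub change above means a NULL check on alloc_dax() no longer detects the CONFIG_DAX=n case; a hedged sketch of the caller-side pattern (my_setup_dax is hypothetical):

#include <linux/dax.h>
#include <linux/err.h>

static int my_setup_dax(void *private, const struct dax_operations *ops)
{
	struct dax_device *dax_dev = alloc_dax(private, ops);

	if (IS_ERR(dax_dev))	/* -EOPNOTSUPP when DAX is not compiled in */
		return PTR_ERR(dax_dev) == -EOPNOTSUPP ? 0 : PTR_ERR(dax_dev);

	set_dax_nocache(dax_dev);	/* stubs now exist for CONFIG_DAX=n */
	set_dax_nomc(dax_dev);
	return 0;
}
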
diff --git a/include/linux/efi.h b/include/linux/efi.h
index f0d56f106b60..d59b0947fba0 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -694,6 +694,11 @@ extern struct efi {
extern struct mm_struct efi_mm;
+static inline bool mm_is_efi(struct mm_struct *mm)
+{
+ return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm;
+}
+
static inline int
efi_guidcmp (efi_guid_t left, efi_guid_t right)
{
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 937c2a9b6e54..c775ea3c6015 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -342,7 +342,7 @@ void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
void page_alloc_init_late(void);
-void setup_pcp_cacheinfo(void);
+void setup_pcp_cacheinfo(unsigned int cpu);
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 1b6053da8754..868c8fb1bbc1 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t;
* include/trace/events/mmflags.h and tools/perf/builtin-kmem.c
*/
+enum {
+ ___GFP_DMA_BIT,
+ ___GFP_HIGHMEM_BIT,
+ ___GFP_DMA32_BIT,
+ ___GFP_MOVABLE_BIT,
+ ___GFP_RECLAIMABLE_BIT,
+ ___GFP_HIGH_BIT,
+ ___GFP_IO_BIT,
+ ___GFP_FS_BIT,
+ ___GFP_ZERO_BIT,
+ ___GFP_UNUSED_BIT, /* 0x200u unused */
+ ___GFP_DIRECT_RECLAIM_BIT,
+ ___GFP_KSWAPD_RECLAIM_BIT,
+ ___GFP_WRITE_BIT,
+ ___GFP_NOWARN_BIT,
+ ___GFP_RETRY_MAYFAIL_BIT,
+ ___GFP_NOFAIL_BIT,
+ ___GFP_NORETRY_BIT,
+ ___GFP_MEMALLOC_BIT,
+ ___GFP_COMP_BIT,
+ ___GFP_NOMEMALLOC_BIT,
+ ___GFP_HARDWALL_BIT,
+ ___GFP_THISNODE_BIT,
+ ___GFP_ACCOUNT_BIT,
+ ___GFP_ZEROTAGS_BIT,
+#ifdef CONFIG_KASAN_HW_TAGS
+ ___GFP_SKIP_ZERO_BIT,
+ ___GFP_SKIP_KASAN_BIT,
+#endif
+#ifdef CONFIG_LOCKDEP
+ ___GFP_NOLOCKDEP_BIT,
+#endif
+ ___GFP_LAST_BIT
+};
+
/* Plain integer GFP bitmasks. Do not use this directly. */
-#define ___GFP_DMA 0x01u
-#define ___GFP_HIGHMEM 0x02u
-#define ___GFP_DMA32 0x04u
-#define ___GFP_MOVABLE 0x08u
-#define ___GFP_RECLAIMABLE 0x10u
-#define ___GFP_HIGH 0x20u
-#define ___GFP_IO 0x40u
-#define ___GFP_FS 0x80u
-#define ___GFP_ZERO 0x100u
+#define ___GFP_DMA BIT(___GFP_DMA_BIT)
+#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT)
+#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT)
+#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT)
+#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT)
+#define ___GFP_HIGH BIT(___GFP_HIGH_BIT)
+#define ___GFP_IO BIT(___GFP_IO_BIT)
+#define ___GFP_FS BIT(___GFP_FS_BIT)
+#define ___GFP_ZERO BIT(___GFP_ZERO_BIT)
/* 0x200u unused */
-#define ___GFP_DIRECT_RECLAIM 0x400u
-#define ___GFP_KSWAPD_RECLAIM 0x800u
-#define ___GFP_WRITE 0x1000u
-#define ___GFP_NOWARN 0x2000u
-#define ___GFP_RETRY_MAYFAIL 0x4000u
-#define ___GFP_NOFAIL 0x8000u
-#define ___GFP_NORETRY 0x10000u
-#define ___GFP_MEMALLOC 0x20000u
-#define ___GFP_COMP 0x40000u
-#define ___GFP_NOMEMALLOC 0x80000u
-#define ___GFP_HARDWALL 0x100000u
-#define ___GFP_THISNODE 0x200000u
-#define ___GFP_ACCOUNT 0x400000u
-#define ___GFP_ZEROTAGS 0x800000u
+#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT)
+#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT)
+#define ___GFP_WRITE BIT(___GFP_WRITE_BIT)
+#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT)
+#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT)
+#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT)
+#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT)
+#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT)
+#define ___GFP_COMP BIT(___GFP_COMP_BIT)
+#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT)
+#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT)
+#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT)
+#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT)
+#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT)
#ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO 0x1000000u
-#define ___GFP_SKIP_KASAN 0x2000000u
+#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT)
+#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT)
#else
#define ___GFP_SKIP_ZERO 0
#define ___GFP_SKIP_KASAN 0
#endif
#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP 0x4000000u
+#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT)
#else
#define ___GFP_NOLOCKDEP 0
#endif
-/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t;
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT ___GFP_LAST_BIT
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/**
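A small illustration (the asserts below are mine, not part of the patch) of what the bit-number enum buys: __GFP_BITS_SHIFT now tracks ___GFP_LAST_BIT automatically, so adding a flag to the enum grows the mask without a manual count update.

#include <linux/gfp_types.h>
#include <linux/bits.h>
#include <linux/build_bug.h>

static_assert(__GFP_BITS_SHIFT == ___GFP_LAST_BIT);
static_assert(___GFP_DMA == BIT(___GFP_DMA_BIT));
static_assert(___GFP_ZEROTAGS == BIT(___GFP_ZEROTAGS_BIT));
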
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 451c1dff0e87..00341b56d291 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -439,6 +439,13 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_local(addr);
}
+/**
+ * memcpy_from_folio - Copy a range of bytes from a folio.
+ * @to: The memory to copy to.
+ * @folio: The folio to read from.
+ * @offset: The first byte in the folio to read.
+ * @len: The number of bytes to copy.
+ */
static inline void memcpy_from_folio(char *to, struct folio *folio,
size_t offset, size_t len)
{
@@ -460,6 +467,13 @@ static inline void memcpy_from_folio(char *to, struct folio *folio,
} while (len > 0);
}
+/**
+ * memcpy_to_folio - Copy a range of bytes to a folio.
+ * @folio: The folio to write to.
+ * @offset: The first byte in the folio to store to.
+ * @from: The memory to copy from.
+ * @len: The number of bytes to copy.
+ */
static inline void memcpy_to_folio(struct folio *folio, size_t offset,
const char *from, size_t len)
{
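A minimal sketch (my_update_record is hypothetical) of using the two helpers documented above; they handle highmem kmap_local and page-boundary crossings within the folio internally:

#include <linux/highmem.h>

static void my_update_record(struct folio *folio, size_t offset,
			     char *buf, size_t len)
{
	memcpy_from_folio(buf, folio, offset, len);
	/* ... modify buf ... */
	memcpy_to_folio(folio, offset, buf, len);
}
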
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5adb86af35fc..de0c89105076 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -265,10 +265,11 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
void folio_prep_large_rmappable(struct folio *folio);
bool can_split_folio(struct folio *folio, int *pextra_pins);
-int split_huge_page_to_list(struct page *page, struct list_head *list);
+int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order);
static inline int split_huge_page(struct page *page)
{
- return split_huge_page_to_list(page, NULL);
+ return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio);
@@ -422,7 +423,8 @@ can_split_folio(struct folio *folio, int *pextra_pins)
return false;
}
static inline int
-split_huge_page_to_list(struct page *page, struct list_head *list)
+split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
+ unsigned int new_order)
{
return 0;
}
@@ -519,17 +521,20 @@ static inline bool thp_migration_supported(void)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline int split_folio_to_list(struct folio *folio,
- struct list_head *list)
+static inline int split_folio_to_list_to_order(struct folio *folio,
+ struct list_head *list, int new_order)
{
- return split_huge_page_to_list(&folio->page, list);
+ return split_huge_page_to_list_to_order(&folio->page, list, new_order);
}
-static inline int split_folio(struct folio *folio)
+static inline int split_folio_to_order(struct folio *folio, int new_order)
{
- return split_folio_to_list(folio, NULL);
+ return split_folio_to_list_to_order(folio, NULL, new_order);
}
+#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
+#define split_folio(f) split_folio_to_order(f, 0)
+
/*
* archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
* limitations in the implementation like arm64 MTE can override this to
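A hedged sketch of the new split-to-order API (my_shrink_folio is hypothetical; which non-zero orders are accepted depends on the folio type, and the folio must be locked as before):

#include <linux/huge_mm.h>
#include <linux/errno.h>

static int my_shrink_folio(struct folio *folio)
{
	if (!can_split_folio(folio, NULL))
		return -EBUSY;
	/* Split into order-2 pieces instead of all the way to base pages. */
	return split_folio_to_order(folio, 2);
}
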
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c1ee640d87b1..77b30a8c6076 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
extern int sysctl_hugetlb_shm_group;
-extern struct list_head huge_boot_pages;
+extern struct list_head huge_boot_pages[MAX_NUMNODES];
/* arch callbacks */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 400cb6c02176..060835bb82d5 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -15,7 +15,8 @@
#if !defined(__ASSEMBLY__)
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
+#include <linux/crash_reserve.h>
#include <asm/io.h>
#include <linux/range.h>
@@ -31,6 +32,7 @@ extern note_buf_t __percpu *crash_notes;
#include <linux/module.h>
#include <linux/highmem.h>
#include <asm/kexec.h>
+#include <linux/crash_core.h>
/* Verify architecture specific macros are defined */
@@ -378,13 +380,6 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image,
static inline int machine_kexec_post_load(struct kimage *image) { return 0; }
#endif
-extern void __crash_kexec(struct pt_regs *);
-extern void crash_kexec(struct pt_regs *);
-int kexec_should_crash(struct task_struct *);
-int kexec_crash_loaded(void);
-void crash_save_cpu(struct pt_regs *regs, int cpu);
-extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
-
extern struct kimage *kexec_image;
extern struct kimage *kexec_crash_image;
@@ -408,24 +403,6 @@ bool kexec_load_permitted(int kexec_image_type);
/* flag to track if kexec reboot is in progress */
extern bool kexec_in_progress;
-int crash_shrink_memory(unsigned long new_size);
-ssize_t crash_get_memory_size(void);
-
-#ifndef arch_kexec_protect_crashkres
-/*
- * Protection mechanism for crashkernel reserved memory after
- * the kdump kernel is loaded.
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-static inline void arch_kexec_protect_crashkres(void) { }
-#endif
-
-#ifndef arch_kexec_unprotect_crashkres
-static inline void arch_kexec_unprotect_crashkres(void) { }
-#endif
-
#ifndef page_to_boot_pfn
static inline unsigned long page_to_boot_pfn(struct page *page)
{
@@ -482,24 +459,6 @@ static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, g
static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
#endif
-#ifndef arch_crash_handle_hotplug_event
-static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
-#endif
-
-int crash_check_update_elfcorehdr(void);
-
-#ifndef crash_hotplug_cpu_support
-static inline int crash_hotplug_cpu_support(void) { return 0; }
-#endif
-
-#ifndef crash_hotplug_memory_support
-static inline int crash_hotplug_memory_support(void) { return 0; }
-#endif
-
-#ifndef crash_get_elfcorehdr_size
-static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; }
-#endif
-
extern bool kexec_file_dbg_print;
#define kexec_dprintk(fmt, ...) \
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index 7675a48a0701..792b67ceb631 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -24,6 +24,8 @@ enum lru_status {
LRU_SKIP, /* item cannot be locked, skip */
LRU_RETRY, /* item not freeable. May drop the lock
internally, but has to return locked. */
+ LRU_STOP, /* stop lru list walking. May drop the lock
+ internally, but has to return locked. */
};
struct list_lru_one {
@@ -62,8 +64,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
#define list_lru_init(lru) \
__list_lru_init((lru), false, NULL, NULL)
-#define list_lru_init_key(lru, key) \
- __list_lru_init((lru), false, (key), NULL)
#define list_lru_init_memcg(lru, shrinker) \
__list_lru_init((lru), true, NULL, shrinker)
@@ -170,22 +170,6 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head);
-/**
- * list_lru_putback: undo list_lru_isolate
- * @lru: the lru pointer.
- * @item: the item to put back.
- * @nid: the node id of the sublist to put the item back to.
- * @memcg: the cgroup of the sublist to put the item back to.
- *
- * Put back an isolated item into its original LRU. Note that unlike
- * list_lru_add, this does not increment the node LRU count (as
- * list_lru_isolate does not originally decrement this count).
- *
- * Since we might have dropped the LRU lock in between, recompute list_lru_one
- * from the node's id and memcg.
- */
-void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
- struct mem_cgroup *memcg);
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
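A sketch of a walk callback using the new LRU_STOP status (my_isolate and the budget argument are hypothetical): once a caller-supplied budget runs out, the walk terminates instead of merely skipping items.

#include <linux/list_lru.h>

static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list,
				  spinlock_t *lock, void *cb_arg)
{
	long *budget = cb_arg;

	if (*budget <= 0)
		return LRU_STOP;	/* stop walking the list entirely */

	list_lru_isolate(list, item);
	(*budget)--;
	return LRU_REMOVED;
}
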
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 20ff87f8e001..394fd0a887ae 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -14,6 +14,7 @@
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
@@ -712,18 +713,16 @@ static inline void mem_cgroup_uncharge(struct folio *folio)
__mem_cgroup_uncharge(folio);
}
-void __mem_cgroup_uncharge_list(struct list_head *page_list);
-static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
+static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
if (mem_cgroup_disabled())
return;
- __mem_cgroup_uncharge_list(page_list);
+ __mem_cgroup_uncharge_folios(folios);
}
void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
-
void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
-
void mem_cgroup_migrate(struct folio *old, struct folio *new);
/**
@@ -1162,7 +1161,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
rcu_read_unlock();
}
-void split_page_memcg(struct page *head, unsigned int nr);
+void split_page_memcg(struct page *head, int old_order, int new_order);
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
@@ -1294,7 +1293,7 @@ static inline void mem_cgroup_uncharge(struct folio *folio)
{
}
-static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
+static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}
@@ -1620,7 +1619,7 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}
-static inline void split_page_memcg(struct page *head, unsigned int nr)
+static inline void split_page_memcg(struct page *head, int old_order, int new_order)
{
}
@@ -1694,18 +1693,18 @@ static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
return folio_lruvec_lock_irq(folio);
}
-/* Don't lock again iff page's lruvec locked */
-static inline struct lruvec *folio_lruvec_relock_irqsave(struct folio *folio,
- struct lruvec *locked_lruvec, unsigned long *flags)
+/* Don't lock again iff folio's lruvec locked */
+static inline void folio_lruvec_relock_irqsave(struct folio *folio,
+ struct lruvec **lruvecp, unsigned long *flags)
{
- if (locked_lruvec) {
- if (folio_matches_lruvec(folio, locked_lruvec))
- return locked_lruvec;
+ if (*lruvecp) {
+ if (folio_matches_lruvec(folio, *lruvecp))
+ return;
- unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
+ unlock_page_lruvec_irqrestore(*lruvecp, *flags);
}
- return folio_lruvec_lock_irqsave(folio, flags);
+ *lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}
#ifdef CONFIG_CGROUP_WRITEBACK
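A sketch of the new in/out-parameter convention for folio_lruvec_relock_irqsave() (my_process_batch is hypothetical): the lruvec lock is only dropped and re-taken when consecutive folios belong to different lruvecs.

#include <linux/memcontrol.h>
#include <linux/pagevec.h>

static void my_process_batch(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags;
	unsigned int i;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		folio_lruvec_relock_irqsave(folio, &lruvec, &flags);
		/* ... operate on folio under the lruvec lru_lock ... */
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
}
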
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f53cfdaaaa41..939a16bd5cea 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -96,8 +96,17 @@ int set_memory_block_size_order(unsigned int order);
#define MEM_GOING_ONLINE (1<<3)
#define MEM_CANCEL_ONLINE (1<<4)
#define MEM_CANCEL_OFFLINE (1<<5)
+#define MEM_PREPARE_ONLINE (1<<6)
+#define MEM_FINISH_OFFLINE (1<<7)
struct memory_notify {
+ /*
+ * The altmap_start_pfn and altmap_nr_pages fields are designated for
+ * specifying the altmap range and are exclusively intended for use in
+ * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
+ */
+ unsigned long altmap_start_pfn;
+ unsigned long altmap_nr_pages;
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
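A hedged sketch of a memory-provider notifier consuming the new events and altmap fields (the notifier and the pr_info placeholders are hypothetical; a real provider would make the range accessible, e.g. via a hypervisor call):

#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int my_memory_notifier(struct notifier_block *nb,
			      unsigned long action, void *arg)
{
	struct memory_notify *mhp = arg;

	switch (action) {
	case MEM_PREPARE_ONLINE:
		/* make the altmap range accessible before the memmap is written */
		pr_info("prepare pfns [%lx, +%lx)\n",
			mhp->altmap_start_pfn, mhp->altmap_nr_pages);
		break;
	case MEM_FINISH_OFFLINE:
		pr_info("release pfns [%lx, +%lx)\n",
			mhp->altmap_start_pfn, mhp->altmap_nr_pages);
		break;
	}
	return NOTIFY_OK;
}
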
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7d2076583494..7a9ff464608d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -106,6 +106,22 @@ typedef int __bitwise mhp_t;
* implies the node id (nid).
*/
#define MHP_NID_IS_MGID ((__force mhp_t)BIT(2))
+/*
+ * The hotplugged memory is completely inaccessible while the memory is
+ * offline. The memory provider will handle MEM_PREPARE_ONLINE /
+ * MEM_FINISH_OFFLINE notifications and make the memory accessible.
+ *
+ * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY,
+ * because the altmap cannot be written (e.g., poisoned) when adding
+ * memory -- before it is set online.
+ *
+ * This allows for adding memory with an altmap that is not currently
+ * made available by a hypervisor. When onlining that memory, the
+ * hypervisor can be instructed to make that memory available, and
+ * the onlining phase will not require any memory allocations, which is
+ * helpful in low-memory situations.
+ */
+#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3))
/*
* Extended parameters for memory hotplug:
@@ -121,6 +137,7 @@ struct mhp_params {
bool mhp_range_allowed(u64 start, u64 size, bool need_mapping);
struct range mhp_get_pluggable_range(bool need_mapping);
+bool mhp_supports_memmap_on_memory(void);
/*
* Zone resizing functions
@@ -154,7 +171,7 @@ extern void adjust_present_page_count(struct page *page,
long nr_pages);
/* VM interface that may be used by firmware interface */
extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone);
+ struct zone *zone, bool mhp_off_inaccessible);
extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group);
@@ -262,6 +279,11 @@ static inline bool movable_node_is_enabled(void)
return false;
}
+static inline bool mhp_supports_memmap_on_memory(void)
+{
+ return false;
+}
+
static inline void pgdat_kswapd_lock(pg_data_t *pgdat) {}
static inline void pgdat_kswapd_unlock(pg_data_t *pgdat) {}
static inline void pgdat_kswapd_lock_init(pg_data_t *pgdat) {}
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 744c830f4b13..3f7143ade32c 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -25,6 +25,7 @@ struct vmem_altmap {
unsigned long free;
unsigned long align;
unsigned long alloc;
+ bool inaccessible;
};
/*
@@ -108,7 +109,7 @@ struct dev_pagemap_ops {
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @ref: reference count that pins the devm_memremap_pages() mapping
* @done: completion for @ref
- * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @type: memory type: see MEMORY_* above in memremap.h
* @flags: PGMAP_* flags to specify detailed behavior
* @vmemmap_shift: structural definition of how the vmemmap page metadata
* is populated, specifically the metadata page order.
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..699e850d143c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -36,6 +36,7 @@ struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
+struct folio_batch;
extern int sysctl_page_lock_unfairness;
@@ -226,7 +227,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *,
/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
-#define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
static inline struct folio *lru_to_folio(struct list_head *head)
{
return list_entry((head)->prev, struct folio, lru);
@@ -781,6 +781,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
return NULL;
}
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_locked(vma->vm_mm);
+}
+
static inline void release_fault_lock(struct vm_fault *vmf)
{
mmap_read_unlock(vmf->vma->vm_mm);
@@ -1178,7 +1183,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
* How many times the entire folio is mapped as a single unit (eg by a
* PMD or PUD entry). This is probably not what you want, except for
* debugging purposes - it does not include PTE-mapped sub-pages; look
- * at folio_mapcount() or page_mapcount() or total_mapcount() instead.
+ * at folio_mapcount() or page_mapcount() instead.
*/
static inline int folio_entire_mapcount(struct folio *folio)
{
@@ -1238,13 +1243,6 @@ static inline int folio_mapcount(struct folio *folio)
return folio_total_mapcount(folio);
}
-static inline int total_mapcount(struct page *page)
-{
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
- return folio_total_mapcount(page_folio(page));
-}
-
static inline bool folio_large_is_mapped(struct folio *folio)
{
/*
@@ -1514,6 +1512,8 @@ static inline void folio_put_refs(struct folio *folio, int refs)
__folio_put(folio);
}
+void folios_put_refs(struct folio_batch *folios, unsigned int *refs);
+
/*
* union release_pages_arg - an array of pages or folios
*
@@ -1536,18 +1536,19 @@ void release_pages(release_pages_arg, int nr);
/**
* folios_put - Decrement the reference count on an array of folios.
* @folios: The folios.
- * @nr: How many folios there are.
*
- * Like folio_put(), but for an array of folios. This is more efficient
- * than writing the loop yourself as it will optimise the locks which
- * need to be taken if the folios are freed.
+ * Like folio_put(), but for a batch of folios. This is more efficient
+ * than writing the loop yourself as it will optimise the locks which need
+ * to be taken if the folios are freed. The folios batch is returned
+ * empty and ready to be reused for another batch; there is no need to
+ * reinitialise it.
*
* Context: May be called in process or interrupt context, but not in NMI
* context. May be called while holding a spinlock.
*/
-static inline void folios_put(struct folio **folios, unsigned int nr)
+static inline void folios_put(struct folio_batch *folios)
{
- release_pages(folios, nr);
+ folios_put_refs(folios, NULL);
}
static inline void put_page(struct page *page)
@@ -1640,13 +1641,11 @@ static inline int page_zone_id(struct page *page)
}
#ifdef NODE_NOT_IN_PAGE_FLAGS
-extern int page_to_nid(const struct page *page);
+int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
- struct page *p = (struct page *)page;
-
- return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
+ return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif
@@ -2065,6 +2064,13 @@ static inline long folio_nr_pages(struct folio *folio)
#endif
}
+/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER)
+#else
+#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES
+#endif
+
/*
* compound_nr() returns the number of pages in this potentially compound
* page. compound_nr() can be called on a tail page, and is defined to
@@ -2595,19 +2601,19 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member)
mm_trace_rss_stat(mm, member);
}
-/* Optimized variant when page is already known not to be PageAnon */
-static inline int mm_counter_file(struct page *page)
+/* Optimized variant when folio is already known not to be anon */
+static inline int mm_counter_file(struct folio *folio)
{
- if (PageSwapBacked(page))
+ if (folio_test_swapbacked(folio))
return MM_SHMEMPAGES;
return MM_FILEPAGES;
}
-static inline int mm_counter(struct page *page)
+static inline int mm_counter(struct folio *folio)
{
- if (PageAnon(page))
+ if (folio_test_anon(folio))
return MM_ANONPAGES;
- return mm_counter_file(page);
+ return mm_counter_file(folio);
}
static inline unsigned long get_mm_rss(struct mm_struct *mm)
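A sketch of the batched-put pattern made possible by folios_put() taking a folio_batch (my_drop_folios is hypothetical): the batch is returned empty by folios_put(), so it can be refilled without re-initialisation.

#include <linux/mm.h>
#include <linux/pagevec.h>

static void my_drop_folios(struct folio **folios, unsigned int nr)
{
	struct folio_batch fbatch;
	unsigned int i;

	folio_batch_init(&fbatch);
	for (i = 0; i < nr; i++) {
		/* folio_batch_add() returns the free slots left; 0 means full */
		if (!folio_batch_add(&fbatch, folios[i]))
			folios_put(&fbatch);
	}
	folios_put(&fbatch);
}
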
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..5240bd7bca33 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -210,8 +210,8 @@ struct page {
*
* An 'encoded_page' pointer is a pointer to a regular 'struct page', but
* with the low bits of the pointer indicating extra context-dependent
- * information. Not super-common, but happens in mmu_gather and mlock
- * handling, and this acts as a type system check on that use.
+ * information. Only used in mmu_gather handling, and this acts as a type
+ * system check on that use.
*
* We only really have two guaranteed bits in general, although you could
* play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -220,21 +220,46 @@ struct page {
* Use the supplied helper functions to endcode/decode the pointer and bits.
*/
struct encoded_page;
-#define ENCODE_PAGE_BITS 3ul
+
+#define ENCODED_PAGE_BITS 3ul
+
+/* Perform rmap removal after we have flushed the TLB. */
+#define ENCODED_PAGE_BIT_DELAY_RMAP 1ul
+
+/*
+ * The next item in an encoded_page array is the "nr_pages" argument, specifying
+ * the number of consecutive pages starting from this page, that all belong to
+ * the same folio. For example, "nr_pages" corresponds to the number of folio
+ * references that must be dropped. If this bit is not set, "nr_pages" is
+ * implicitly 1.
+ */
+#define ENCODED_PAGE_BIT_NR_PAGES_NEXT 2ul
+
static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
{
- BUILD_BUG_ON(flags > ENCODE_PAGE_BITS);
+ BUILD_BUG_ON(flags > ENCODED_PAGE_BITS);
return (struct encoded_page *)(flags | (unsigned long)page);
}
static inline unsigned long encoded_page_flags(struct encoded_page *page)
{
- return ENCODE_PAGE_BITS & (unsigned long)page;
+ return ENCODED_PAGE_BITS & (unsigned long)page;
}
static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
- return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
+ return (struct page *)(~ENCODED_PAGE_BITS & (unsigned long)page);
+}
+
+static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr)
+{
+ VM_WARN_ON_ONCE((nr << 2) >> 2 != nr);
+ return (struct encoded_page *)(nr << 2);
+}
+
+static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page)
+{
+ return ((unsigned long)page) >> 2;
}
/*
@@ -394,12 +419,13 @@ FOLIO_MATCH(compound_head, _head_2a);
/**
* struct ptdesc - Memory descriptor for page tables.
- * @__page_flags: Same as page flags. Unused for page tables.
+ * @__page_flags: Same as page flags. Powerpc only.
* @pt_rcu_head: For freeing page table pages.
* @pt_list: List of used page tables. Used for s390 and x86.
* @_pt_pad_1: Padding that aliases with page's compound head.
* @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
* @__page_mapping: Aliases with page->mapping. Unused for page tables.
+ * @pt_index: Used for s390 gmap.
* @pt_mm: Used for x86 pgds.
* @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
* @_pt_pad_2: Padding to ensure proper alignment.
@@ -425,6 +451,7 @@ struct ptdesc {
unsigned long __page_mapping;
union {
+ pgoff_t pt_index;
struct mm_struct *pt_mm;
atomic_t pt_frag_refcount;
};
@@ -450,6 +477,7 @@ TABLE_MATCH(flags, __page_flags);
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
+TABLE_MATCH(index, pt_index);
TABLE_MATCH(rcu_head, pt_rcu_head);
TABLE_MATCH(page_type, __page_type);
TABLE_MATCH(_refcount, __page_refcount);
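An illustration of the renamed encoded_page helpers and the new nr_pages follow-up entry (the two functions below are hypothetical, showing only the encode/decode round trip used by mmu_gather):

#include <linux/mm_types.h>

static void my_fill_slots(struct encoded_page **slot, struct page *page,
			  unsigned long nr)
{
	if (nr == 1) {
		slot[0] = encode_page(page, 0);
	} else {
		slot[0] = encode_page(page, ENCODED_PAGE_BIT_NR_PAGES_NEXT);
		slot[1] = encode_nr_pages(nr);	/* consumed by the decoder */
	}
}

static unsigned long my_decode_slots(struct encoded_page **slot,
				     struct page **pagep)
{
	*pagep = encoded_page_ptr(slot[0]);
	if (encoded_page_flags(slot[0]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)
		return encoded_nr_pages(slot[1]);
	return 1;
}
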
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 7c3e7b0b0e8f..39a7714605a7 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -10,7 +10,7 @@ struct vm_area_struct;
struct mm_struct;
struct vma_iterator;
-void dump_page(struct page *page, const char *reason);
+void dump_page(const struct page *page, const char *reason);
void dump_vma(const struct vm_area_struct *vma);
void dump_mm(const struct mm_struct *mm);
void vma_iter_dump_tree(const struct vma_iterator *vmi);
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index f2b7a3f04099..bbaec80c78c5 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -11,7 +11,7 @@
#endif
#ifndef leave_mm
-static inline void leave_mm(int cpu) { }
+static inline void leave_mm(void) { }
#endif
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a497f189d988..c11b7cde81ef 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -76,9 +76,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES];
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
+# define is_migrate_cma_folio(folio, pfn) (MIGRATE_CMA == \
+ get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK))
#else
# define is_migrate_cma(migratetype) false
# define is_migrate_cma_page(_page) false
+# define is_migrate_cma_folio(folio, pfn) false
#endif
static inline bool is_migrate_movable(int mt)
@@ -464,7 +467,7 @@ enum {
#define NR_BLOOM_FILTERS 2
struct lru_gen_mm_state {
- /* set to max_seq after each iteration */
+ /* synced with max_seq after each iteration */
unsigned long seq;
/* where the current iteration continues after */
struct list_head *head;
@@ -479,8 +482,8 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_folio */
- unsigned long max_seq;
+ /* max_seq from lru_gen_folio: can be out of date */
+ unsigned long seq;
/* the next address within an mm to scan */
unsigned long next_addr;
/* to batch promoted pages */
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 001b2ce83832..89b1e0ed9811 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -115,6 +115,14 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *mod);
+#ifdef CONFIG_MODULES
+void flush_module_init_free_work(void);
+#else
+static inline void flush_module_init_free_work(void)
+{
+}
+#endif
+
/* Any cleanup needed when module leaves. */
void module_arch_cleanup(struct module *mod);
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 495b16b6b4d7..0146daf34430 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -137,6 +137,7 @@ struct padata_shell {
* appropriate for one worker thread to do at once.
* @max_threads: Max threads to use for the job, actual number may be less
* depending on task size and minimum chunk size.
+ * @numa_aware: Distribute jobs to different nodes with CPU in a round robin fashion.
*/
struct padata_mt_job {
void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
@@ -146,6 +147,7 @@ struct padata_mt_job {
unsigned long align;
unsigned long min_chunk;
int max_threads;
+ bool numa_aware;
};
/**
@@ -178,10 +180,6 @@ struct padata_instance {
#ifdef CONFIG_PADATA
extern void __init padata_init(void);
-#else
-static inline void __init padata_init(void) {}
-#endif
-
extern struct padata_instance *padata_alloc(const char *name);
extern void padata_free(struct padata_instance *pinst);
extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
@@ -192,4 +190,12 @@ extern void padata_do_serial(struct padata_priv *padata);
extern void __init padata_do_multithreaded(struct padata_mt_job *job);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
+#else
+static inline void __init padata_init(void) {}
+static inline void __init padata_do_multithreaded(struct padata_mt_job *job)
+{
+ job->thread_fn(job->start, job->start + job->size, job->fn_arg);
+}
+#endif
+
#endif
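A minimal sketch of a multithreaded init job using the new @numa_aware flag (my_init_range/my_init_all are hypothetical); note that with CONFIG_PADATA=n the new stub simply runs the whole range inline in the caller.

#include <linux/init.h>
#include <linux/padata.h>

static void __init my_init_range(unsigned long start, unsigned long end,
				 void *arg)
{
	/* ... initialise items in [start, end) ... */
}

static void __init my_init_all(unsigned long nr_items)
{
	struct padata_mt_job job = {
		.thread_fn	= my_init_range,
		.fn_arg		= NULL,
		.start		= 0,
		.size		= nr_items,
		.align		= 1,
		.min_chunk	= 1024,
		.max_threads	= 4,
		.numa_aware	= true,
	};

	padata_do_multithreaded(&job);
}
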
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 735cddc13d20..652d77805e99 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -237,7 +237,7 @@ static inline const struct page *page_fixed_fake_head(const struct page *page)
}
#endif
-static __always_inline int page_is_fake_head(struct page *page)
+static __always_inline int page_is_fake_head(const struct page *page)
{
return page_fixed_fake_head(page) != page;
}
@@ -281,12 +281,12 @@ static inline unsigned long _compound_head(const struct page *page)
*/
#define folio_page(folio, n) nth_page(&(folio)->page, n)
-static __always_inline int PageTail(struct page *page)
+static __always_inline int PageTail(const struct page *page)
{
return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page);
}
-static __always_inline int PageCompound(struct page *page)
+static __always_inline int PageCompound(const struct page *page)
{
return test_bit(PG_head, &page->flags) ||
READ_ONCE(page->compound_head) & 1;
@@ -306,6 +306,16 @@ static inline void page_init_poison(struct page *page, size_t size)
}
#endif
+static const unsigned long *const_folio_flags(const struct folio *folio,
+ unsigned n)
+{
+ const struct page *page = &folio->page;
+
+ VM_BUG_ON_PGFLAGS(PageTail(page), page);
+ VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
+ return &page[n].flags;
+}
+
static unsigned long *folio_flags(struct folio *folio, unsigned n)
{
struct page *page = &folio->page;
@@ -328,9 +338,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
* for compound page all operations related to the page flag applied to
* head page.
*
- * PF_ONLY_HEAD:
- * for compound page, callers only ever operate on the head page.
- *
* PF_NO_TAIL:
* modifications of the page flag must be done on small or head pages,
* checks can be done on tail pages too.
@@ -346,9 +353,6 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
page; })
#define PF_ANY(page, enforce) PF_POISONED_CHECK(page)
#define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page))
-#define PF_ONLY_HEAD(page, enforce) ({ \
- VM_BUG_ON_PGFLAGS(PageTail(page), page); \
- PF_POISONED_CHECK(page); })
#define PF_NO_TAIL(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
PF_POISONED_CHECK(compound_head(page)); })
@@ -362,59 +366,81 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
/* Which page is the flag stored in */
#define FOLIO_PF_ANY 0
#define FOLIO_PF_HEAD 0
-#define FOLIO_PF_ONLY_HEAD 0
#define FOLIO_PF_NO_TAIL 0
#define FOLIO_PF_NO_COMPOUND 0
#define FOLIO_PF_SECOND 1
+#define FOLIO_HEAD_PAGE 0
+#define FOLIO_SECOND_PAGE 1
+
/*
* Macros to create function definitions for page flags
*/
+#define FOLIO_TEST_FLAG(name, page) \
+static __always_inline bool folio_test_##name(const struct folio *folio) \
+{ return test_bit(PG_##name, const_folio_flags(folio, page)); }
+
+#define FOLIO_SET_FLAG(name, page) \
+static __always_inline void folio_set_##name(struct folio *folio) \
+{ set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_CLEAR_FLAG(name, page) \
+static __always_inline void folio_clear_##name(struct folio *folio) \
+{ clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define __FOLIO_SET_FLAG(name, page) \
+static __always_inline void __folio_set_##name(struct folio *folio) \
+{ __set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define __FOLIO_CLEAR_FLAG(name, page) \
+static __always_inline void __folio_clear_##name(struct folio *folio) \
+{ __clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_TEST_SET_FLAG(name, page) \
+static __always_inline bool folio_test_set_##name(struct folio *folio) \
+{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_TEST_CLEAR_FLAG(name, page) \
+static __always_inline bool folio_test_clear_##name(struct folio *folio) \
+{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }
+
+#define FOLIO_FLAG(name, page) \
+FOLIO_TEST_FLAG(name, page) \
+FOLIO_SET_FLAG(name, page) \
+FOLIO_CLEAR_FLAG(name, page)
+
#define TESTPAGEFLAG(uname, lname, policy) \
-static __always_inline bool folio_test_##lname(struct folio *folio) \
-{ return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
-static __always_inline int Page##uname(struct page *page) \
+FOLIO_TEST_FLAG(lname, FOLIO_##policy) \
+static __always_inline int Page##uname(const struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
#define SETPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void folio_set_##lname(struct folio *folio) \
-{ set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }
#define CLEARPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void folio_clear_##lname(struct folio *folio) \
-{ clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define __SETPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void __folio_set_##lname(struct folio *folio) \
-{ __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+__FOLIO_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }
#define __CLEARPAGEFLAG(uname, lname, policy) \
-static __always_inline \
-void __folio_clear_##lname(struct folio *folio) \
-{ __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTSETFLAG(uname, lname, policy) \
-static __always_inline \
-bool folio_test_set_##lname(struct folio *folio) \
-{ return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }
#define TESTCLEARFLAG(uname, lname, policy) \
-static __always_inline \
-bool folio_test_clear_##lname(struct folio *folio) \
-{ return test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \
+FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \
static __always_inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }
@@ -465,7 +491,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
-PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
+FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
@@ -532,13 +558,13 @@ PAGEFLAG_FALSE(HighMem, highmem)
#endif
#ifdef CONFIG_SWAP
-static __always_inline bool folio_test_swapcache(struct folio *folio)
+static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
return folio_test_swapbacked(folio) &&
- test_bit(PG_swapcache, folio_flags(folio, 0));
+ test_bit(PG_swapcache, const_folio_flags(folio, 0));
}
-static __always_inline bool PageSwapCache(struct page *page)
+static __always_inline bool PageSwapCache(const struct page *page)
{
return folio_test_swapcache(page_folio(page));
}
@@ -583,10 +609,10 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
-TESTPAGEFLAG(Young, young, PF_ANY)
-SETPAGEFLAG(Young, young, PF_ANY)
-TESTCLEARFLAG(Young, young, PF_ANY)
-PAGEFLAG(Idle, idle, PF_ANY)
+FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
+FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
+FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
+FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
/*
@@ -637,22 +663,22 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
*/
#define PAGE_MAPPING_DAX_SHARED ((void *)0x1)
-static __always_inline bool folio_mapping_flags(struct folio *folio)
+static __always_inline bool folio_mapping_flags(const struct folio *folio)
{
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
}
-static __always_inline int PageMappingFlags(struct page *page)
+static __always_inline int PageMappingFlags(const struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
}
-static __always_inline bool folio_test_anon(struct folio *folio)
+static __always_inline bool folio_test_anon(const struct folio *folio)
{
return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
}
-static __always_inline bool PageAnon(struct page *page)
+static __always_inline bool PageAnon(const struct page *page)
{
return folio_test_anon(page_folio(page));
}
@@ -663,7 +689,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio)
PAGE_MAPPING_MOVABLE;
}
-static __always_inline int __PageMovable(struct page *page)
+static __always_inline int __PageMovable(const struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_MOVABLE;
@@ -676,13 +702,13 @@ static __always_inline int __PageMovable(struct page *page)
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
-static __always_inline bool folio_test_ksm(struct folio *folio)
+static __always_inline bool folio_test_ksm(const struct folio *folio)
{
return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
PAGE_MAPPING_KSM;
}
-static __always_inline bool PageKsm(struct page *page)
+static __always_inline bool PageKsm(const struct page *page)
{
return folio_test_ksm(page_folio(page));
}
@@ -721,9 +747,9 @@ static inline bool folio_xor_flags_has_waiters(struct folio *folio,
* some of the bytes in it may be; see the is_partially_uptodate()
* address_space operation.
*/
-static inline bool folio_test_uptodate(struct folio *folio)
+static inline bool folio_test_uptodate(const struct folio *folio)
{
- bool ret = test_bit(PG_uptodate, folio_flags(folio, 0));
+ bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0));
/*
* Must ensure that the data we read out of the folio is loaded
* _after_ we've loaded folio->flags to check the uptodate bit.
@@ -738,7 +764,7 @@ static inline bool folio_test_uptodate(struct folio *folio)
return ret;
}
-static inline int PageUptodate(struct page *page)
+static inline int PageUptodate(const struct page *page)
{
return folio_test_uptodate(page_folio(page));
}
@@ -780,12 +806,12 @@ void set_page_writeback(struct page *page);
#define folio_start_writeback_keepwrite(folio) \
__folio_start_writeback(folio, true)
-static __always_inline bool folio_test_head(struct folio *folio)
+static __always_inline bool folio_test_head(const struct folio *folio)
{
- return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY));
+ return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY));
}
-static __always_inline int PageHead(struct page *page)
+static __always_inline int PageHead(const struct page *page)
{
PF_POISONED_CHECK(page);
return test_bit(PG_head, &page->flags) && !page_is_fake_head(page);
@@ -801,7 +827,7 @@ CLEARPAGEFLAG(Head, head, PF_ANY)
*
* Return: True if the folio is larger than one page.
*/
-static inline bool folio_test_large(struct folio *folio)
+static inline bool folio_test_large(const struct folio *folio)
{
return folio_test_head(folio);
}
@@ -830,7 +856,7 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
#define PG_head_mask ((1UL << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
-int PageHuge(struct page *page);
+int PageHuge(const struct page *page);
SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
@@ -843,10 +869,10 @@ CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
* Return: True for hugetlbfs folios, false for anon folios or folios
* belonging to other filesystems.
*/
-static inline bool folio_test_hugetlb(struct folio *folio)
+static inline bool folio_test_hugetlb(const struct folio *folio)
{
return folio_test_large(folio) &&
- test_bit(PG_hugetlb, folio_flags(folio, 1));
+ test_bit(PG_hugetlb, const_folio_flags(folio, 1));
}
#else
TESTPAGEFLAG_FALSE(Huge, hugetlb)
@@ -861,7 +887,7 @@ TESTPAGEFLAG_FALSE(Huge, hugetlb)
* hugetlbfs pages, but not normal pages. PageTransHuge() can only be
* called in the core VM paths where hugetlbfs pages can't exist.
*/
-static inline int PageTransHuge(struct page *page)
+static inline int PageTransHuge(const struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
return PageHead(page);
@@ -872,7 +898,7 @@ static inline int PageTransHuge(struct page *page)
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
*/
-static inline int PageTransCompound(struct page *page)
+static inline int PageTransCompound(const struct page *page)
{
return PageCompound(page);
}
@@ -882,7 +908,7 @@ static inline int PageTransCompound(struct page *page)
* and hugetlbfs pages, so it should only be called when it's known
* that hugetlbfs pages aren't involved.
*/
-static inline int PageTransTail(struct page *page)
+static inline int PageTransTail(const struct page *page)
{
return PageTail(page);
}
@@ -946,7 +972,7 @@ static inline int page_type_has_type(unsigned int page_type)
return (int)page_type < PAGE_MAPCOUNT_RESERVE;
}
-static inline int page_has_type(struct page *page)
+static inline int page_has_type(const struct page *page)
{
return page_type_has_type(page->page_type);
}
@@ -1030,7 +1056,7 @@ extern bool is_free_buddy_page(struct page *page);
PAGEFLAG(Isolated, isolated, PF_ANY);
-static __always_inline int PageAnonExclusive(struct page *page)
+static __always_inline int PageAnonExclusive(const struct page *page)
{
VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
@@ -1103,19 +1129,18 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
* Determine if a page has private stuff, indicating that release routines
* should be invoked upon it.
*/
-static inline int page_has_private(struct page *page)
+static inline int page_has_private(const struct page *page)
{
return !!(page->flags & PAGE_FLAGS_PRIVATE);
}
-static inline bool folio_has_private(struct folio *folio)
+static inline bool folio_has_private(const struct folio *folio)
{
return page_has_private(&folio->page);
}
#undef PF_ANY
#undef PF_HEAD
-#undef PF_ONLY_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
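
For reference, a minimal hand expansion of what FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) above generates, derived directly from the FOLIO_TEST/SET/CLEAR_FLAG definitions earlier in this hunk (a sketch, not literal preprocessor output):

static __always_inline bool folio_test_waiters(const struct folio *folio)
{ return test_bit(PG_waiters, const_folio_flags(folio, FOLIO_HEAD_PAGE)); }

static __always_inline void folio_set_waiters(struct folio *folio)
{ set_bit(PG_waiters, folio_flags(folio, FOLIO_HEAD_PAGE)); }

static __always_inline void folio_clear_waiters(struct folio *folio)
{ clear_bit(PG_waiters, folio_flags(folio, FOLIO_HEAD_PAGE)); }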
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c141ea9a95ef..8cd858d912c4 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -4,7 +4,7 @@
#include <linux/atomic.h>
#include <linux/cache.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
#include <asm/page.h>
struct page_counter {
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 119a0c9d2a8b..debdc25f08b9 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -11,7 +11,8 @@ extern struct page_ext_operations page_owner_ops;
extern void __reset_page_owner(struct page *page, unsigned short order);
extern void __set_page_owner(struct page *page,
unsigned short order, gfp_t gfp_mask);
-extern void __split_page_owner(struct page *page, unsigned int nr);
+extern void __split_page_owner(struct page *page, int old_order,
+ int new_order);
extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
extern void __set_page_owner_migrate_reason(struct page *page, int reason);
extern void __dump_page_owner(const struct page *page);
@@ -31,10 +32,11 @@ static inline void set_page_owner(struct page *page,
__set_page_owner(page, order, gfp_mask);
}
-static inline void split_page_owner(struct page *page, unsigned int nr)
+static inline void split_page_owner(struct page *page, int old_order,
+ int new_order)
{
if (static_branch_unlikely(&page_owner_inited))
- __split_page_owner(page, nr);
+ __split_page_owner(page, old_order, new_order);
}
static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
{
@@ -56,11 +58,11 @@ static inline void reset_page_owner(struct page *page, unsigned short order)
{
}
static inline void set_page_owner(struct page *page,
- unsigned int order, gfp_t gfp_mask)
+ unsigned short order, gfp_t gfp_mask)
{
}
-static inline void split_page_owner(struct page *page,
- unsigned short order)
+static inline void split_page_owner(struct page *page, int old_order,
+ int new_order)
{
}
static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
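
A brief, hypothetical illustration of the new split_page_owner() calling convention: the caller now passes the old and new orders instead of the number of resulting pages (the helper name is only for illustration):

#include <linux/page_owner.h>

/* Splitting an order-9 compound page into order-0 pages: the old interface
 * would have passed nr = 512, the new one passes both orders. */
static inline void example_split_owner(struct page *page)
{
	split_page_owner(page, /* old_order */ 9, /* new_order */ 0);
}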
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 87cc678adc85..fcc06c300a72 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -27,6 +27,7 @@ struct folio;
*/
struct folio_batch {
unsigned char nr;
+ unsigned char i;
bool percpu_pvec_drained;
struct folio *folios[PAGEVEC_SIZE];
};
@@ -40,12 +41,14 @@ struct folio_batch {
static inline void folio_batch_init(struct folio_batch *fbatch)
{
fbatch->nr = 0;
+ fbatch->i = 0;
fbatch->percpu_pvec_drained = false;
}
static inline void folio_batch_reinit(struct folio_batch *fbatch)
{
fbatch->nr = 0;
+ fbatch->i = 0;
}
static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
@@ -75,6 +78,21 @@ static inline unsigned folio_batch_add(struct folio_batch *fbatch,
return folio_batch_space(fbatch);
}
+/**
+ * folio_batch_next - Return the next folio to process.
+ * @fbatch: The folio batch being processed.
+ *
+ * Use this function to implement a queue of folios.
+ *
+ * Return: The next folio in the queue, or NULL if the queue is empty.
+ */
+static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
+{
+ if (fbatch->i == fbatch->nr)
+ return NULL;
+ return fbatch->folios[fbatch->i++];
+}
+
void __folio_batch_release(struct folio_batch *pvec);
static inline void folio_batch_release(struct folio_batch *fbatch)
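
A usage sketch of the queue pattern folio_batch_next() is documented for; the helper name is hypothetical and the per-folio work is just a folio_put():

#include <linux/mm.h>
#include <linux/pagevec.h>

static void example_drain_queue(struct folio_batch *fbatch)
{
	struct folio *folio;

	/* Consume folios in FIFO order until the batch is empty. */
	while ((folio = folio_batch_next(fbatch)) != NULL)
		folio_put(folio);

	folio_batch_reinit(fbatch);	/* ready for reuse */
}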
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index f6d0e3513948..85fc7554cd52 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,15 +212,37 @@ static inline int pmd_dirty(pmd_t pmd)
#define arch_flush_lazy_mmu_mode() do {} while (0)
#endif
-#ifndef set_ptes
+#ifndef pte_batch_hint
+/**
+ * pte_batch_hint - Number of pages that can be added to batch without scanning.
+ * @ptep: Page table pointer for the entry.
+ * @pte: Page table entry.
+ *
+ * Some architectures know that a set of contiguous ptes all map the same
+ * contiguous memory with the same permissions. In this case, it can provide a
+ * hint to aid pte batching without the core code needing to scan every pte.
+ *
+ * An architecture implementation may ignore the PTE accessed state. Further,
+ * the dirty state must apply atomically to all the PTEs described by the hint.
+ *
+ * May be overridden by the architecture, else pte_batch_hint is always 1.
+ */
+static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
+{
+ return 1;
+}
+#endif
-#ifndef pte_next_pfn
-static inline pte_t pte_next_pfn(pte_t pte)
+#ifndef pte_advance_pfn
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
- return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
#endif
+#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)
+
+#ifndef set_ptes
/**
* set_ptes - Map consecutive pages to a contiguous range of addresses.
* @mm: Address space to map the pages into.
@@ -229,6 +251,10 @@ static inline pte_t pte_next_pfn(pte_t pte)
* @pte: Page table entry for the first page.
* @nr: Number of pages to map.
*
+ * When nr==1, the initial state of the pte may be present or not present, and
+ * the new state may be present or not present. When nr>1, the initial state of
+ * all ptes must be not present, and the new state must be present.
+ *
* May be overridden by the architecture, or the architecture can define
* set_pte() and PFN_PTE_SHIFT.
*
@@ -580,6 +606,76 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
}
#endif
+#ifndef get_and_clear_full_ptes
+/**
+ * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of
+ * the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the
+ * returned PTE.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ pte_t pte, tmp_pte;
+
+ pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+ return pte;
+}
+#endif
+
+#ifndef clear_full_ptes
+/**
+ * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
+ * folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ ptep_get_and_clear_full(mm, addr, ptep, full);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
/*
* If two threads concurrently fault at the same page, the thread that
@@ -650,6 +746,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif
+#ifndef wrprotect_ptes
+/**
+ * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
+ * folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to write-protect.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_set_wrprotect().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ for (;;) {
+ ptep_set_wrprotect(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+#endif
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
@@ -1650,16 +1777,16 @@ typedef unsigned int pgtbl_mod_mask;
* Only meaningful when called on a valid entry.
*/
#ifndef pgd_leaf
-#define pgd_leaf(x) 0
+#define pgd_leaf(x) false
#endif
#ifndef p4d_leaf
-#define p4d_leaf(x) 0
+#define p4d_leaf(x) false
#endif
#ifndef pud_leaf
-#define pud_leaf(x) 0
+#define pud_leaf(x) false
#endif
#ifndef pmd_leaf
-#define pmd_leaf(x) 0
+#define pmd_leaf(x) false
#endif
#ifndef pgd_leaf_size
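
A minimal sketch of how the batched helpers above combine, assuming the caller already knows that @nr PTEs map consecutive pages of one folio and holds the page table lock (hypothetical helper, not the core-mm implementation):

#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/pgtable.h>

static void example_wrprotect_folio_ptes(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, unsigned int nr)
{
	while (nr) {
		/* Let the architecture hint skip per-PTE scanning where it can. */
		unsigned int batch = min(nr, pte_batch_hint(ptep, ptep_get(ptep)));

		wrprotect_ptes(mm, addr, ptep, batch);
		addr += (unsigned long)batch * PAGE_SIZE;
		ptep += batch;
		nr -= batch;
	}
}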
diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h
index 2a3a95586425..8dbd51ea8626 100644
--- a/include/linux/ptdump.h
+++ b/include/linux/ptdump.h
@@ -18,6 +18,16 @@ struct ptdump_state {
const struct ptdump_range *range;
};
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
+ bool checkwx, bool dmesg);
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd);
+bool ptdump_check_wx(void);
+
+static inline void debug_checkwx(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_WX))
+ ptdump_check_wx();
+}
#endif /* _LINUX_PTDUMP_H */
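
debug_checkwx() is meant to be called by the architecture once kernel mappings have reached their final permissions; a hedged sketch of such a call site (the function name and placement are arch-specific and only illustrative):

#include <linux/init.h>
#include <linux/ptdump.h>

void __init example_mark_rodata_ro(void)
{
	/* ... the architecture applies final text/rodata permissions ... */

	/* Report any remaining writable+executable kernel mappings. */
	debug_checkwx();
}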
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17cb0761ff65..94f0e618865b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1261,6 +1261,7 @@ struct task_struct {
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
+ u8 il_weight;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1625,15 +1626,15 @@ extern struct pid *cad_pid;
#define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* Dumped core */
#define PF_SIGNALED 0x00000400 /* Killed by a signal */
-#define PF_MEMALLOC 0x00000800 /* Allocating memory */
+#define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
#define PF__HOLE__00010000 0x00010000
#define PF_KSWAPD 0x00020000 /* I am kswapd */
-#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
-#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
+#define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
+#define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
@@ -1643,7 +1644,8 @@ extern struct pid *cad_pid;
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
-#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
+#define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning.
+ * See memalloc_pin_save() */
#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */
#define PF__HOLE__40000000 0x40000000
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 9a19f1b42f64..7a4066d22883 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -315,7 +315,8 @@ static inline void might_alloc(gfp_t gfp_mask)
* point of view. Use memalloc_noio_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_noio_restore.
*/
static inline unsigned int memalloc_noio_save(void)
{
@@ -346,7 +347,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
* point of view. Use memalloc_nofs_restore to end the scope with flags
* returned by this function.
*
- * This function is safe to be used from any context.
+ * Context: This function is safe to be used from any context.
+ * Return: The saved flags to be passed to memalloc_nofs_restore.
*/
static inline unsigned int memalloc_nofs_save(void)
{
@@ -368,6 +370,29 @@ static inline void memalloc_nofs_restore(unsigned int flags)
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
}
+/**
+ * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
+ *
+ * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
+ * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
+ * prevents entering reclaim and allows access to all memory reserves. This
+ * should only be used when the caller guarantees the allocation will allow more
+ * memory to be freed very shortly, i.e. it needs to allocate some memory in
+ * the process of freeing memory, and cannot reclaim due to potential recursion.
+ *
+ * Users of this scope have to be extremely careful not to deplete the reserves
+ * completely and must implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory. Usage of a
+ * pre-allocated pool (e.g. mempool) should always be considered before using
+ * this scope.
+ *
+ * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC.
+ *
+ * Context: This function should not be used in an interrupt context, because
+ * interrupt context does not grant PF_MEMALLOC access to the reserves.
+ * See __gfp_pfmemalloc_flags().
+ * Return: The saved flags to be passed to memalloc_noreclaim_restore.
+ */
static inline unsigned int memalloc_noreclaim_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC;
@@ -375,11 +400,29 @@ static inline unsigned int memalloc_noreclaim_save(void)
return flags;
}
+/**
+ * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save().
+ * Always make sure that @flags is the value returned by the pairing
+ * memalloc_noreclaim_save() call.
+ */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
}
+/**
+ * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
+ *
+ * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
+ * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
+ * will constrain the allocations to zones that allow long term pinning, i.e.
+ * zones other than ZONE_MOVABLE.
+ *
+ * Return: The saved flags to be passed to memalloc_pin_restore.
+ */
static inline unsigned int memalloc_pin_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_PIN;
@@ -388,6 +431,14 @@ static inline unsigned int memalloc_pin_save(void)
return flags;
}
+/**
+ * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
+ * @flags: Flags to restore.
+ *
+ * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save().
+ * Always make sure that @flags is the value returned by the pairing
+ * memalloc_pin_save() call.
+ */
static inline void memalloc_pin_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
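
A short usage sketch of the scope API documented above; the helper is hypothetical and the allocation itself is an ordinary kmalloc():

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *example_alloc_in_reclaim(size_t size)
{
	unsigned int flags = memalloc_noreclaim_save();
	void *p;

	/* The allocation may now dip into reserves; the caller must use it
	 * to free memory shortly, per the PF_MEMALLOC rules above. */
	p = kmalloc(size, GFP_KERNEL);

	memalloc_noreclaim_restore(flags);
	return p;
}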
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index adcbb8f23600..3c6caa5abc7c 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -30,6 +30,53 @@ typedef u32 depot_stack_handle_t;
*/
#define STACK_DEPOT_EXTRA_BITS 5
+#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)
+
+#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
+#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
+#define DEPOT_STACK_ALIGN 4
+#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
+#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
+ STACK_DEPOT_EXTRA_BITS)
+
+#ifdef CONFIG_STACKDEPOT
+/* Compact structure that stores a reference to a stack. */
+union handle_parts {
+ depot_stack_handle_t handle;
+ struct {
+ /* pool_index is offset by 1 */
+ u32 pool_index : DEPOT_POOL_INDEX_BITS;
+ u32 offset : DEPOT_OFFSET_BITS;
+ u32 extra : STACK_DEPOT_EXTRA_BITS;
+ };
+};
+
+struct stack_record {
+ struct list_head hash_list; /* Links in the hash table */
+ u32 hash; /* Hash in hash table */
+ u32 size; /* Number of stored frames */
+ union handle_parts handle; /* Constant after initialization */
+ refcount_t count;
+ union {
+ unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
+ struct {
+ /*
+ * An important invariant of the implementation is to
+ * only place a stack record onto the freelist iff its
+ * refcount is zero. Because stack records with a zero
+ * refcount are never considered as valid, it is safe to
+ * union @entries and freelist management state below.
+ * Conversely, as soon as an entry is off the freelist
+ * and its refcount becomes non-zero, the below must not
+ * be accessed until being placed back on the freelist.
+ */
+ struct list_head free_list; /* Links in the freelist */
+ unsigned long rcu_state; /* RCU cookie */
+ };
+ };
+};
+#endif
+
typedef u32 depot_flags_t;
/*
@@ -132,6 +179,17 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
unsigned int nr_entries, gfp_t gfp_flags);
/**
+ * __stack_depot_get_stack_record - Get a pointer to a stack_record struct
+ *
+ * @handle: Stack depot handle
+ *
+ * This function is only for internal purposes.
+ *
+ * Return: Returns a pointer to a stack_record struct
+ */
+struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle);
+
+/**
* stack_depot_fetch - Fetch a stack trace from stack depot
*
* @handle: Stack depot handle returned from stack_depot_save()
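
To make the handle encoding above concrete, a small debug-only sketch that splits a handle into its parts (hypothetical helper; note that pool_index is stored offset by 1 and the offset field counts DEPOT_STACK_ALIGN-sized units):

#include <linux/printk.h>
#include <linux/stackdepot.h>

#ifdef CONFIG_STACKDEPOT
static void example_describe_handle(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	if (!handle)		/* 0 means "no stack recorded" */
		return;

	pr_debug("pool %u, byte offset %u, extra bits %#x\n",
		 (unsigned int)(parts.pool_index - 1),
		 (unsigned int)(parts.offset << DEPOT_STACK_ALIGN),
		 (unsigned int)parts.extra);
}
#endif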
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 95410ca891a7..f53d608daa01 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -437,9 +437,9 @@ static inline unsigned long total_swapcache_pages(void)
return global_node_page_state(NR_SWAPCACHE);
}
-extern void free_swap_cache(struct page *page);
-extern void free_page_and_swap_cache(struct page *);
-extern void free_pages_and_swap_cache(struct encoded_page **, int);
+void free_swap_cache(struct folio *folio);
+void free_page_and_swap_cache(struct page *);
+void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
@@ -521,7 +521,7 @@ static inline void put_swap_device(struct swap_info_struct *si)
/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
#define free_swap_and_cache(e) is_pfn_swap_entry(e)
-static inline void free_swap_cache(struct page *page)
+static inline void free_swap_cache(struct folio *folio)
{
}
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bff1e8d97de0..48b700ba1d18 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
return p;
}
+static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
+{
+ struct folio *folio = pfn_folio(swp_offset_pfn(entry));
+
+ /*
+ * Any use of migration entries may only occur while the
+ * corresponding folio is locked
+ */
+ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
+
+ return folio;
+}
+
/*
* A pfn swap entry is a special type of swap entry that always has a pfn stored
* in the swap offset. They are used to represent unaddressable device memory
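
A brief sketch of using the new pfn_swap_entry_folio() helper when resolving a non-present PTE (the helper name is hypothetical):

#include <linux/swapops.h>

static struct folio *example_nonpresent_pte_to_folio(pte_t pte)
{
	swp_entry_t entry = pte_to_swp_entry(pte);

	/* Only pfn swap entries (migration, device, hwpoison) carry a pfn. */
	if (!is_pfn_swap_entry(entry))
		return NULL;

	return pfn_swap_entry_folio(entry);
}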
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e4056547fbe6..05d59f74fc88 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,52 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ * fd_wqh.lock
+ * fault_pending_wqh.lock
+ * fault_wqh.lock
+ * event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* waitqueue head for events */
+ wait_queue_head_t event_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ seqcount_spinlock_t refile_seq;
+ /* pseudo fd refcounting */
+ refcount_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* features requested from the userspace */
+ unsigned int features;
+ /* released */
+ bool released;
+ /*
+ * Prevents userfaultfd operations (fill/move/wp) from happening while
+ * some non-cooperative event(s) is taking place. Increments are done
+ * in write-mode. Whereas, userfaultfd operations, which includes
+ * reading mmap_changing, is done under read-mode.
+ */
+ struct rw_semaphore map_changing_lock;
+ /* memory mappings are changing because of non-cooperative event */
+ atomic_t mmap_changing;
+ /* mm with one or more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
/* A combined operation mode + behavior flags. */
@@ -74,31 +120,26 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, uffd_flags_t flags);
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
- unsigned long start, unsigned long len,
- bool enable_wp, atomic_t *mmap_changing);
+ unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
-ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
- unsigned long dst_start, unsigned long src_start,
- unsigned long len, __u64 flags);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma,
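
A sketch of the locking protocol described in the userfaultfd_ctx comment above: operations take map_changing_lock for reading and back off while mmap_changing is set (hypothetical helper, error handling trimmed):

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/rwsem.h>
#include <linux/userfaultfd_k.h>

static int example_uffd_op(struct userfaultfd_ctx *ctx)
{
	int err = -EAGAIN;

	down_read(&ctx->map_changing_lock);
	if (!atomic_read(&ctx->mmap_changing)) {
		/* ... perform the fill/move/wp operation here ... */
		err = 0;
	}
	up_read(&ctx->map_changing_lock);
	return err;
}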
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0f72c85a377b..98ea90e90439 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -258,7 +258,6 @@ extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
/*
* Internals. Don't use..
*/
-extern struct list_head vmap_area_list;
extern __init void vm_area_add_early(struct vm_struct *vm);
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
new file mode 100644
index 000000000000..e1dec1a6a749
--- /dev/null
+++ b/include/linux/vmcore_info.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VMCORE_INFO_H
+#define LINUX_VMCORE_INFO_H
+
+#include <linux/linkage.h>
+#include <linux/elfcore.h>
+#include <linux/elf.h>
+
+#define CRASH_CORE_NOTE_NAME "CORE"
+#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
+#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
+#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
+
+/*
+ * The per-cpu notes area is a list of notes terminated by a "NULL"
+ * note header. For kdump, the code in vmcore.c runs in the context
+ * of the second kernel to combine them into one note.
+ */
+#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
+ CRASH_CORE_NOTE_NAME_BYTES + \
+ CRASH_CORE_NOTE_DESC_BYTES)
+
+#define VMCOREINFO_BYTES PAGE_SIZE
+#define VMCOREINFO_NOTE_NAME "VMCOREINFO"
+#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \
+ VMCOREINFO_NOTE_NAME_BYTES + \
+ VMCOREINFO_BYTES)
+
+typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
+/* Per cpu memory for storing cpu states in case of system crash. */
+extern note_buf_t __percpu *crash_notes;
+
+void crash_update_vmcoreinfo_safecopy(void *ptr);
+void crash_save_vmcoreinfo(void);
+void arch_crash_save_vmcoreinfo(void);
+__printf(1, 2)
+void vmcoreinfo_append_str(const char *fmt, ...);
+phys_addr_t paddr_vmcoreinfo_note(void);
+
+#define VMCOREINFO_OSRELEASE(value) \
+ vmcoreinfo_append_str("OSRELEASE=%s\n", value)
+#define VMCOREINFO_BUILD_ID() \
+ ({ \
+ static_assert(sizeof(vmlinux_build_id) == 20); \
+ vmcoreinfo_append_str("BUILD-ID=%20phN\n", vmlinux_build_id); \
+ })
+
+#define VMCOREINFO_PAGESIZE(value) \
+ vmcoreinfo_append_str("PAGESIZE=%ld\n", value)
+#define VMCOREINFO_SYMBOL(name) \
+ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
+#define VMCOREINFO_SYMBOL_ARRAY(name) \
+ vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)name)
+#define VMCOREINFO_SIZE(name) \
+ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+ (unsigned long)sizeof(name))
+#define VMCOREINFO_STRUCT_SIZE(name) \
+ vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
+ (unsigned long)sizeof(struct name))
+#define VMCOREINFO_OFFSET(name, field) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+ (unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_TYPE_OFFSET(name, field) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+ (unsigned long)offsetof(name, field))
+#define VMCOREINFO_LENGTH(name, value) \
+ vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
+#define VMCOREINFO_NUMBER(name) \
+ vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
+#define VMCOREINFO_CONFIG(name) \
+ vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
+
+extern unsigned char *vmcoreinfo_data;
+extern size_t vmcoreinfo_size;
+extern u32 *vmcoreinfo_note;
+
+Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
+ void *data, size_t data_len);
+void final_note(Elf_Word *buf);
+#endif /* LINUX_VMCORE_INFO_H */
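
As a usage sketch, a hypothetical arch_crash_save_vmcoreinfo() exporting a number, a structure size, a field offset, and a config option through the macros above; the particular items exported are purely illustrative:

#include <linux/mm_types.h>
#include <linux/vmcore_info.h>

void arch_crash_save_vmcoreinfo(void)
{
	VMCOREINFO_NUMBER(PAGE_SHIFT);
	VMCOREINFO_STRUCT_SIZE(page);
	VMCOREINFO_OFFSET(page, flags);
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	VMCOREINFO_CONFIG(SPARSEMEM_VMEMMAP);
#endif
}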
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 453736fd1d23..9845cb62e40b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -11,6 +11,7 @@
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
+#include <linux/pagevec.h>
struct bio;
@@ -40,6 +41,7 @@ enum writeback_sync_modes {
* in a manner such that unspecified fields are set to zero.
*/
struct writeback_control {
+ /* public fields that can be set and/or consumed by the caller: */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
long pages_skipped; /* Pages which were not written */
@@ -77,6 +79,11 @@ struct writeback_control {
*/
struct swap_iocb **swap_plug;
+ /* internal fields used by the ->writepages implementation: */
+ struct folio_batch fbatch;
+ pgoff_t index;
+ int saved_err;
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *wb; /* wb this writeback is issued under */
struct inode *inode; /* inode being written out */
@@ -360,11 +367,12 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
bool wb_over_bg_thresh(struct bdi_writeback *wb);
+struct folio *writeback_iter(struct address_space *mapping,
+ struct writeback_control *wbc, struct folio *folio, int *error);
+
typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
void *data);
-void tag_pages_for_writeback(struct address_space *mapping,
- pgoff_t start, pgoff_t end);
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data);
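
A sketch of the open-coded loop writeback_iter() is designed for in a filesystem's ->writepages implementation (hypothetical; the per-folio step just redirties and unlocks instead of starting real I/O):

#include <linux/pagemap.h>
#include <linux/writeback.h>

static int example_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	struct folio *folio = NULL;
	int error = 0;

	while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
		/*
		 * A real implementation would start writeback on @folio and
		 * record the result in @error.  Placeholder: keep it dirty.
		 */
		folio_redirty_for_writepage(wbc, folio);
		folio_unlock(folio);
	}
	return error;
}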
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 0b709f5bc65f..341aea490070 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -29,8 +29,8 @@ struct zswap_lruvec_state {
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
-void zswap_invalidate(int type, pgoff_t offset);
-void zswap_swapon(int type);
+void zswap_invalidate(swp_entry_t swp);
+int zswap_swapon(int type, unsigned long nr_pages);
void zswap_swapoff(int type);
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
void zswap_lruvec_state_init(struct lruvec *lruvec);
@@ -50,8 +50,11 @@ static inline bool zswap_load(struct folio *folio)
return false;
}
-static inline void zswap_invalidate(int type, pgoff_t offset) {}
-static inline void zswap_swapon(int type) {}
+static inline void zswap_invalidate(swp_entry_t swp) {}
+static inline int zswap_swapon(int type, unsigned long nr_pages)
+{
+ return 0;
+}
static inline void zswap_swapoff(int type) {}
static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
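
Finally, a small sketch of call sites adapted to the reworked zswap hooks above, which now take the number of swap pages at swapon time and a full swp_entry_t at invalidation (both callers are hypothetical):

#include <linux/zswap.h>

static int example_swapon_hook(int type, unsigned long maxpages)
{
	/* zswap_swapon() now returns 0 or -errno, so swapon can fail cleanly. */
	return zswap_swapon(type, maxpages);
}

static void example_free_swap_slot(swp_entry_t entry)
{
	zswap_invalidate(entry);
}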