summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/bitops/instrumented-lock.h28
-rw-r--r--include/asm-generic/bitops/lock.h20
-rw-r--r--include/asm-generic/pgalloc.h7
-rw-r--r--include/linux/bootmem_info.h2
-rw-r--r--include/linux/buffer_head.h81
-rw-r--r--include/linux/cacheinfo.h1
-rw-r--r--include/linux/cgroup-defs.h5
-rw-r--r--include/linux/damon.h74
-rw-r--r--include/linux/dax.h10
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/gfp.h12
-rw-r--r--include/linux/hugetlb.h25
-rw-r--r--include/linux/hugetlb_cgroup.h11
-rw-r--r--include/linux/jbd2.h2
-rw-r--r--include/linux/memblock.h9
-rw-r--r--include/linux/memcontrol.h93
-rw-r--r--include/linux/memory-tiers.h41
-rw-r--r--include/linux/mempolicy.h58
-rw-r--r--include/linux/migrate.h4
-rw-r--r--include/linux/mm.h176
-rw-r--r--include/linux/mm_inline.h20
-rw-r--r--include/linux/mm_types.h49
-rw-r--r--include/linux/mmu_notifier.h9
-rw-r--r--include/linux/mmzone.h29
-rw-r--r--include/linux/page-flags.h19
-rw-r--r--include/linux/pagemap.h33
-rw-r--r--include/linux/percpu_counter.h30
-rw-r--r--include/linux/rmap.h4
-rw-r--r--include/linux/sched.h4
-rw-r--r--include/linux/sched/coredump.h19
-rw-r--r--include/linux/sched/mm.h4
-rw-r--r--include/linux/sched/numa_balancing.h6
-rw-r--r--include/linux/shmem_fs.h16
-rw-r--r--include/linux/shrinker.h87
-rw-r--r--include/linux/userfaultfd_k.h28
-rw-r--r--include/linux/wait.h9
-rw-r--r--include/trace/events/damon.h45
-rw-r--r--include/trace/events/migrate.h24
-rw-r--r--include/trace/events/vmscan.h8
-rw-r--r--include/uapi/linux/fs.h59
-rw-r--r--include/uapi/linux/mempolicy.h2
-rw-r--r--include/uapi/linux/prctl.h3
-rw-r--r--include/uapi/linux/userfaultfd.h9
43 files changed, 837 insertions, 344 deletions
diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h
index eb64bd4f11f3..542d3727ee4e 100644
--- a/include/asm-generic/bitops/instrumented-lock.h
+++ b/include/asm-generic/bitops/instrumented-lock.h
@@ -58,27 +58,25 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
return arch_test_and_set_bit_lock(nr, addr);
}
-#if defined(arch_clear_bit_unlock_is_negative_byte)
/**
- * clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom
- * byte is negative, for unlock.
- * @nr: the bit to clear
- * @addr: the address to start counting from
+ * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
+ * it is negative, for unlock.
+ * @mask: Change the bits which are set in this mask.
+ * @addr: The address of the word containing the byte to change.
*
+ * Changes some of bits 0-6 in the word pointed to by @addr.
* This operation is atomic and provides release barrier semantics.
+ * Used to optimise some folio operations which are commonly paired
+ * with an unlock or end of writeback. Bit 7 is used as PG_waiters to
+ * indicate whether anybody is waiting for the unlock.
*
- * This is a bit of a one-trick-pony for the filemap code, which clears
- * PG_locked and tests PG_waiters,
+ * Return: Whether the top bit of the byte is set.
*/
-static inline bool
-clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
+static inline bool xor_unlock_is_negative_byte(unsigned long mask,
+ volatile unsigned long *addr)
{
kcsan_release();
- instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
- return arch_clear_bit_unlock_is_negative_byte(nr, addr);
+ instrument_atomic_write(addr, sizeof(long));
+ return arch_xor_unlock_is_negative_byte(mask, addr);
}
-/* Let everybody know we have it. */
-#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte
-#endif
-
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */
diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h
index 40913516e654..14d4ec8c5152 100644
--- a/include/asm-generic/bitops/lock.h
+++ b/include/asm-generic/bitops/lock.h
@@ -66,27 +66,15 @@ arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
raw_atomic_long_set_release((atomic_long_t *)p, old);
}
-/**
- * arch_clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom
- * byte is negative, for unlock.
- * @nr: the bit to clear
- * @addr: the address to start counting from
- *
- * This is a bit of a one-trick-pony for the filemap code, which clears
- * PG_locked and tests PG_waiters,
- */
-#ifndef arch_clear_bit_unlock_is_negative_byte
-static inline bool arch_clear_bit_unlock_is_negative_byte(unsigned int nr,
- volatile unsigned long *p)
+#ifndef arch_xor_unlock_is_negative_byte
+static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
+ volatile unsigned long *p)
{
long old;
- unsigned long mask = BIT_MASK(nr);
- p += BIT_WORD(nr);
- old = raw_atomic_long_fetch_andnot_release(mask, (atomic_long_t *)p);
+ old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p);
return !!(old & BIT(7));
}
-#define arch_clear_bit_unlock_is_negative_byte arch_clear_bit_unlock_is_negative_byte
#endif
#include <asm-generic/bitops/instrumented-lock.h>
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index c75d4a753849..879e5f8aa5e9 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -169,6 +169,8 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
ptdesc = pagetable_alloc(gfp, 0);
if (!ptdesc)
return NULL;
+
+ pagetable_pud_ctor(ptdesc);
return ptdesc_address(ptdesc);
}
@@ -190,8 +192,11 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pud);
+
BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- pagetable_free(virt_to_ptdesc(pud));
+ pagetable_pud_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
#ifndef __HAVE_ARCH_PUD_FREE
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index e1a3c9c9754c..cffa38a73618 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page,
static inline void free_bootmem_page(struct page *page)
{
- kmemleak_free_part(page_to_virt(page), PAGE_SIZE);
+ kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE);
free_reserved_page(page);
}
#endif
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 44e9de51eedf..5f23ee599889 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -198,13 +198,11 @@ void touch_buffer(struct buffer_head *bh);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
- bool retry);
+ gfp_t gfp);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry);
-void create_empty_buffers(struct page *, unsigned long,
- unsigned long b_state);
-void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
- unsigned long b_state);
+struct buffer_head *create_empty_buffers(struct folio *folio,
+ unsigned long blocksize, unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
void end_buffer_async_write(struct buffer_head *bh, int uptodate);
@@ -227,8 +225,8 @@ void __wait_on_buffer(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
unsigned size);
-struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block,
- unsigned size, gfp_t gfp);
+struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size);
@@ -338,17 +336,38 @@ sb_breadahead(struct super_block *sb, sector_t block)
__breadahead(sb->s_bdev, block, sb->s_blocksize);
}
-static inline struct buffer_head *
-sb_getblk(struct super_block *sb, sector_t block)
+static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
+ sector_t block, unsigned size)
{
- return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
+ gfp_t gfp;
+
+ gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+ gfp |= __GFP_NOFAIL;
+
+ return bdev_getblk(bdev, block, size, gfp);
}
+static inline struct buffer_head *__getblk(struct block_device *bdev,
+ sector_t block, unsigned size)
+{
+ gfp_t gfp;
-static inline struct buffer_head *
-sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+ gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+ gfp |= __GFP_MOVABLE | __GFP_NOFAIL;
+
+ return bdev_getblk(bdev, block, size, gfp);
+}
+
+static inline struct buffer_head *sb_getblk(struct super_block *sb,
+ sector_t block)
{
- return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+ return __getblk(sb->s_bdev, block, sb->s_blocksize);
+}
+
+static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb,
+ sector_t block, gfp_t gfp)
+{
+ return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp);
}
static inline struct buffer_head *
@@ -385,20 +404,6 @@ static inline void lock_buffer(struct buffer_head *bh)
__lock_buffer(bh);
}
-static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
- sector_t block,
- unsigned size)
-{
- return __getblk_gfp(bdev, block, size, 0);
-}
-
-static inline struct buffer_head *__getblk(struct block_device *bdev,
- sector_t block,
- unsigned size)
-{
- return __getblk_gfp(bdev, block, size, __GFP_MOVABLE);
-}
-
static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags)
{
if (!buffer_uptodate(bh) && trylock_buffer(bh)) {
@@ -450,6 +455,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
}
+/**
+ * get_nth_bh - Get a reference on the n'th buffer after this one.
+ * @bh: The buffer to start counting from.
+ * @count: How many buffers to skip.
+ *
+ * This is primarily useful for finding the nth buffer in a folio; in
+ * that case you pass the head buffer and the byte offset in the folio
+ * divided by the block size. It can be used for other purposes, but
+ * it will wrap at the end of the folio rather than returning NULL or
+ * proceeding to the next folio for you.
+ *
+ * Return: The requested buffer with an elevated refcount.
+ */
+static inline __must_check
+struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count)
+{
+ while (count--)
+ bh = bh->b_this_page;
+ get_bh(bh);
+ return bh;
+}
+
bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
#ifdef CONFIG_BUFFER_HEAD
diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
index a5cfd44fab45..d504eb4b49ab 100644
--- a/include/linux/cacheinfo.h
+++ b/include/linux/cacheinfo.h
@@ -73,6 +73,7 @@ struct cacheinfo {
struct cpu_cacheinfo {
struct cacheinfo *info_list;
+ unsigned int per_cpu_data_slice_size;
unsigned int num_levels;
unsigned int num_leaves;
bool cpu_map_populated;
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 265da00a1a8b..4a6b6b77ccb6 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -115,6 +115,11 @@ enum {
* Enable recursive subtree protection
*/
CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+ /*
+ * Enable hugetlb accounting for the memory controller.
+ */
+ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
};
/* cftype->flags */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index ae2664d1d5f1..ab2f17d9926b 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -40,9 +40,24 @@ struct damon_addr_range {
* @ar: The address range of the region.
* @sampling_addr: Address of the sample for the next access check.
* @nr_accesses: Access frequency of this region.
+ * @nr_accesses_bp: @nr_accesses in basis point (0.01%) that updated for
+ * each sampling interval.
* @list: List head for siblings.
* @age: Age of this region.
*
+ * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
+ * increased for every &damon_attrs->sample_interval if an access to the region
+ * during the last sampling interval is found. The update of this field should
+ * not be done with direct access but with the helper function,
+ * damon_update_region_access_rate().
+ *
+ * @nr_accesses_bp is another representation of @nr_accesses in basis point
+ * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a
+ * manner similar to moving sum. By the algorithm, this value becomes
+ * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval. This can
+ * be used when the aggregation interval is too huge and therefore cannot wait
+ * for it before getting the access monitoring results.
+ *
* @age is initially zero, increased for each aggregation interval, and reset
* to zero again if the access frequency is significantly changed. If two
* regions are merged into a new region, both @nr_accesses and @age of the new
@@ -52,6 +67,7 @@ struct damon_region {
struct damon_addr_range ar;
unsigned long sampling_addr;
unsigned int nr_accesses;
+ unsigned int nr_accesses_bp;
struct list_head list;
unsigned int age;
@@ -298,24 +314,24 @@ struct damos_access_pattern {
* struct damos - Represents a Data Access Monitoring-based Operation Scheme.
* @pattern: Access pattern of target regions.
* @action: &damo_action to be applied to the target regions.
+ * @apply_interval_us: The time between applying the @action.
* @quota: Control the aggressiveness of this scheme.
* @wmarks: Watermarks for automated (in)activation of this scheme.
* @filters: Additional set of &struct damos_filter for &action.
* @stat: Statistics of this scheme.
* @list: List head for siblings.
*
- * For each aggregation interval, DAMON finds regions which fit in the
+ * For each @apply_interval_us, DAMON finds regions which fit in the
* &pattern and applies &action to those. To avoid consuming too much
* CPU time or IO resources for the &action, &quota is used.
*
+ * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead.
+ *
* To do the work only when needed, schemes can be activated for specific
* system situations using &wmarks. If all schemes that registered to the
* monitoring context are inactive, DAMON stops monitoring either, and just
* repeatedly checks the watermarks.
*
- * If all schemes that registered to a &struct damon_ctx are inactive, DAMON
- * stops monitoring and just repeatedly checks the watermarks.
- *
* Before applying the &action to a memory region, &struct damon_operations
* implementation could check pages of the region and skip &action to respect
* &filters
@@ -327,6 +343,14 @@ struct damos_access_pattern {
struct damos {
struct damos_access_pattern pattern;
enum damos_action action;
+ unsigned long apply_interval_us;
+/* private: internal use only */
+ /*
+ * number of sample intervals that should be passed before applying
+ * @action
+ */
+ unsigned long next_apply_sis;
+/* public: */
struct damos_quota quota;
struct damos_watermarks wmarks;
struct list_head filters;
@@ -472,13 +496,14 @@ struct damon_callback {
* regions.
*
* For each @sample_interval, DAMON checks whether each region is accessed or
- * not. It aggregates and keeps the access information (number of accesses to
- * each region) for @aggr_interval time. DAMON also checks whether the target
- * memory regions need update (e.g., by ``mmap()`` calls from the application,
- * in case of virtual memory monitoring) and applies the changes for each
- * @ops_update_interval. All time intervals are in micro-seconds.
- * Please refer to &struct damon_operations and &struct damon_callback for more
- * detail.
+ * not during the last @sample_interval. If such access is found, DAMON
+ * aggregates the information by increasing &damon_region->nr_accesses for
+ * @aggr_interval time. For each @aggr_interval, the count is reset. DAMON
+ * also checks whether the target memory regions need update (e.g., by
+ * ``mmap()`` calls from the application, in case of virtual memory monitoring)
+ * and applies the changes for each @ops_update_interval. All time intervals
+ * are in micro-seconds. Please refer to &struct damon_operations and &struct
+ * damon_callback for more detail.
*/
struct damon_attrs {
unsigned long sample_interval;
@@ -522,8 +547,18 @@ struct damon_ctx {
struct damon_attrs attrs;
/* private: internal use only */
- struct timespec64 last_aggregation;
- struct timespec64 last_ops_update;
+ /* number of sample intervals that passed since this context started */
+ unsigned long passed_sample_intervals;
+ /*
+ * number of sample intervals that should be passed before next
+ * aggregation
+ */
+ unsigned long next_aggregation_sis;
+ /*
+ * number of sample intervals that should be passed before next ops
+ * update
+ */
+ unsigned long next_ops_update_sis;
/* public: */
struct task_struct *kdamond;
@@ -608,6 +643,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t);
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
unsigned int nr_ranges);
+void damon_update_region_access_rate(struct damon_region *r, bool accessed,
+ struct damon_attrs *attrs);
struct damos_filter *damos_new_filter(enum damos_filter_type type,
bool matching);
@@ -615,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f);
void damos_destroy_filter(struct damos_filter *f);
struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
- enum damos_action action, struct damos_quota *quota,
+ enum damos_action action,
+ unsigned long apply_interval_us,
+ struct damos_quota *quota,
struct damos_watermarks *wmarks);
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s);
void damon_destroy_scheme(struct damos *s);
@@ -642,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx)
return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR;
}
+static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs)
+{
+ /* {aggr,sample}_interval are unsigned long, hence could overflow */
+ return min(attrs->aggr_interval / attrs->sample_interval,
+ (unsigned long)UINT_MAX);
+}
+
int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 22cd9902345d..b463502b16e1 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
-dax_entry_t dax_lock_page(struct page *page);
-void dax_unlock_page(struct page *page, dax_entry_t cookie);
+dax_entry_t dax_lock_folio(struct folio *folio);
+void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
unsigned long index, struct page **page);
void dax_unlock_mapping_entry(struct address_space *mapping,
@@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping,
return -EOPNOTSUPP;
}
-static inline dax_entry_t dax_lock_page(struct page *page)
+static inline dax_entry_t dax_lock_folio(struct folio *folio)
{
- if (IS_DAX(page->mapping->host))
+ if (IS_DAX(folio->mapping->host))
return ~0UL;
return 0;
}
-static inline void dax_unlock_page(struct page *page, dax_entry_t cookie)
+static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c27c324ba58a..98b7a7a8c42e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops;
* It is also used to block modification of page cache contents through
* memory mappings.
* @gfp_mask: Memory allocation flags to use for allocating pages.
- * @i_mmap_writable: Number of VM_SHARED mappings.
+ * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
* @nr_thps: Number of THPs in the pagecache (non-shmem only).
* @i_mmap: Tree of private and shared mappings.
* @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
@@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping)
/*
* Might pages of this file have been modified in userspace?
- * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap
+ * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
* marks vma as VM_SHARED if it is shared, and the file was opened for
* writing i.e. vma may be mprotected writable even if now readonly.
*
@@ -1270,7 +1270,7 @@ struct super_block {
const struct dentry_operations *s_d_op; /* default d_op for dentries */
- struct shrinker s_shrink; /* per-sb shrinker handle */
+ struct shrinker *s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 665f06675c83..de292a007138 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -8,6 +8,7 @@
#include <linux/topology.h>
struct vm_area_struct;
+struct mempolicy;
/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
#ifdef CONFIG_NUMA
struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct folio *folio_alloc(gfp_t gfp, unsigned order);
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+ struct mempolicy *mpol, pgoff_t ilx, int nid);
+struct folio *folio_alloc(gfp_t gfp, unsigned int order);
struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage);
#else
@@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
return alloc_pages_node(numa_node_id(), gfp_mask, order);
}
+static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+ struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+ return alloc_pages(gfp, order);
+}
static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
{
return __folio_alloc_node(gfp, order, numa_node_id());
@@ -320,11 +328,13 @@ extern void page_frag_free(void *addr);
#define free_page(addr) free_pages((addr), 0)
void page_alloc_init_cpuhp(void);
+int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
/*
* gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 47d25a5e1933..d3acecc5db4b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio);
#ifdef CONFIG_HUGETLB_PAGE
-#include <linux/mempolicy.h>
+#include <linux/pagemap.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>
@@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long cp_flags);
bool is_hugetlb_entry_migration(pte_t pte);
+bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -544,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
}
struct hugetlbfs_inode_info {
- struct shared_policy policy;
struct inode vfs_inode;
unsigned int seals;
};
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address);
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -844,6 +842,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
return huge_page_size(h) / 512;
}
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+ struct address_space *mapping, pgoff_t idx)
+{
+ return filemap_lock_folio(mapping, idx << huge_page_order(h));
+}
+
#include <asm/hugetlb.h>
#ifndef is_hugepage_only_range
@@ -1040,6 +1044,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
return NULL;
}
+static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
+ struct address_space *mapping, pgoff_t idx)
+{
+ return NULL;
+}
+
static inline int isolate_or_dissolve_huge_page(struct page *page,
struct list_head *list)
{
@@ -1060,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
return NULL;
}
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- return NULL;
-}
-
static inline int __alloc_bootmem_huge_page(struct hstate *h)
{
return 0;
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 3d82d91f49ac..e5d64b8b59c2 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -22,13 +22,6 @@ struct resv_map;
struct file_region;
#ifdef CONFIG_CGROUP_HUGETLB
-/*
- * Minimum page order trackable by hugetlb cgroup.
- * At least 3 pages are necessary for all the tracking information.
- * The second tail page contains all of the hugetlb-specific fields.
- */
-#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE)
-
enum hugetlb_memory_event {
HUGETLB_MAX,
HUGETLB_NR_MEMORY_EVENTS,
@@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup *
__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
{
VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
- if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
- return NULL;
if (rsvd)
return folio->_hugetlb_cgroup_rsvd;
else
@@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio,
struct hugetlb_cgroup *h_cg, bool rsvd)
{
VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
- if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER)
- return;
if (rsvd)
folio->_hugetlb_cgroup_rsvd = h_cg;
else
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 52772c826c86..6dcbb4eb80fb 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -886,7 +886,7 @@ struct journal_s
* Journal head shrinker, reclaim buffer's journal head which
* has been written back.
*/
- struct shrinker j_shrinker;
+ struct shrinker *j_shrinker;
/**
* @j_checkpoint_jh_count:
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 1c1072e3ca06..ae3bde302f70 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn;
* via a driver, and never indicated in the firmware-provided memory map as
* system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
* kernel resource tree.
+ * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
+ * not initialized (only for reserved regions).
*/
enum memblock_flags {
MEMBLOCK_NONE = 0x0, /* No special request */
@@ -47,6 +49,7 @@ enum memblock_flags {
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */
+ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */
};
/**
@@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);
void memblock_free_all(void);
void memblock_free(void *ptr, size_t size);
@@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
return m->flags & MEMBLOCK_NOMAP;
}
+static inline bool memblock_is_reserved_noinit(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_RSRV_NOINIT;
+}
+
static inline bool memblock_is_driver_managed(struct memblock_region *m)
{
return m->flags & MEMBLOCK_DRIVER_MANAGED;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..7bdcf3020d7a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,7 @@
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
+#include <linux/shrinker.h>
struct mem_cgroup;
struct obj_cgroup;
@@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter {
unsigned int generation;
};
-/*
- * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
- * shrinkers, which have elements charged to this memcg.
- */
-struct shrinker_info {
- struct rcu_head rcu;
- atomic_long_t *nr_deferred;
- unsigned long *map;
- int map_nr_max;
-};
-
struct lruvec_stats_percpu {
/* Local (CPU and cgroup) state */
long state[NR_VM_NODE_STAT_ITEMS];
@@ -153,7 +143,7 @@ struct mem_cgroup_threshold_ary {
/* Size of entries[] */
unsigned int size;
/* Array of thresholds */
- struct mem_cgroup_threshold entries[];
+ struct mem_cgroup_threshold entries[] __counted_by(size);
};
struct mem_cgroup_thresholds {
@@ -299,7 +289,13 @@ struct mem_cgroup {
#ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
- struct obj_cgroup __rcu *objcg;
+ /*
+ * memcg->objcg is wiped out as a part of the objcg repaprenting
+ * process. memcg->orig_objcg preserves a pointer (and a reference)
+ * to the original objcg until the end of live of memcg.
+ */
+ struct obj_cgroup __rcu *objcg;
+ struct obj_cgroup *orig_objcg;
/* list of inherited objcgs, protected by objcg_lock */
struct list_head objcg_list;
#endif
@@ -662,6 +658,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
page_counter_read(&memcg->memory);
}
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
+
int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
/**
@@ -686,6 +684,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
return __mem_cgroup_charge(folio, mm, gfp);
}
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ long nr_pages);
+
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -713,6 +714,10 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
__mem_cgroup_uncharge_list(page_list);
}
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages);
+
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
+
void mem_cgroup_migrate(struct folio *old, struct folio *new);
/**
@@ -769,6 +774,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
+struct mem_cgroup *get_mem_cgroup_from_current(void);
+
struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1080,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
local_irq_restore(flags);
}
-static inline void count_memcg_page_event(struct page *page,
- enum vm_event_item idx)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (memcg)
- count_memcg_events(memcg, idx, 1);
-}
-
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
{
@@ -1249,12 +1247,23 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
return false;
}
+static inline void mem_cgroup_commit_charge(struct folio *folio,
+ struct mem_cgroup *memcg)
+{
+}
+
static inline int mem_cgroup_charge(struct folio *folio,
struct mm_struct *mm, gfp_t gfp)
{
return 0;
}
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp, long nr_pages)
+{
+ return 0;
+}
+
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
@@ -1273,6 +1282,16 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
{
}
+static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+}
+
+static inline void mem_cgroup_replace_folio(struct folio *old,
+ struct folio *new)
+{
+}
+
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
}
@@ -1310,6 +1329,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
return NULL;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ return NULL;
+}
+
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
@@ -1565,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
{
}
-static inline void count_memcg_page_event(struct page *page,
- int idx)
-{
-}
-
static inline void count_memcg_folio_events(struct folio *folio,
enum vm_event_item idx, unsigned long nr)
{
@@ -1763,9 +1782,27 @@ bool mem_cgroup_kmem_disabled(void);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
-struct obj_cgroup *get_obj_cgroup_from_current(void);
+/*
+ * The returned objcg pointer is safe to use without additional
+ * protection within a scope. The scope is defined either by
+ * the current task (similar to the "current" global variable)
+ * or by set_active_memcg() pair.
+ * Please, use obj_cgroup_get() to get a reference if the pointer
+ * needs to be used outside of the local scope.
+ */
+struct obj_cgroup *current_obj_cgroup(void);
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
+static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = current_obj_cgroup();
+
+ if (objcg)
+ obj_cgroup_get(objcg);
+
+ return objcg;
+}
+
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 437441cdf78f..1e39d27bee41 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -6,6 +6,7 @@
#include <linux/nodemask.h>
#include <linux/kref.h>
#include <linux/mmzone.h>
+#include <linux/notifier.h>
/*
* Each tier cover a abstrace distance chunk size of 128
*/
@@ -22,7 +23,9 @@
struct memory_tier;
struct memory_dev_type {
/* list of memory types that are part of same tier as this type */
- struct list_head tier_sibiling;
+ struct list_head tier_sibling;
+ /* list of memory types that are managed by one driver */
+ struct list_head list;
/* abstract distance for this specific memory type */
int adistance;
/* Nodes of same abstract distance */
@@ -30,12 +33,21 @@ struct memory_dev_type {
struct kref kref;
};
+struct node_hmem_attrs;
+
#ifdef CONFIG_NUMA
extern bool numa_demotion_enabled;
+extern struct memory_dev_type *default_dram_type;
struct memory_dev_type *alloc_memory_type(int adistance);
void put_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
+int register_mt_adistance_algorithm(struct notifier_block *nb);
+int unregister_mt_adistance_algorithm(struct notifier_block *nb);
+int mt_calc_adistance(int node, int *adist);
+int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+ const char *source);
+int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -60,6 +72,7 @@ static inline bool node_is_toptier(int node)
#else
#define numa_demotion_enabled false
+#define default_dram_type NULL
/*
* CONFIG_NUMA implementation returns non NULL error.
*/
@@ -97,5 +110,31 @@ static inline bool node_is_toptier(int node)
{
return true;
}
+
+static inline int register_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int mt_calc_adistance(int node, int *adist)
+{
+ return NOTIFY_DONE;
+}
+
+static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf,
+ const char *source)
+{
+ return -EIO;
+}
+
+static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist)
+{
+ return -EIO;
+}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..931b118336f4 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -17,6 +17,8 @@
struct mm_struct;
+#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */
+
#ifdef CONFIG_NUMA
/*
@@ -89,8 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
return pol;
}
-#define vma_policy(vma) ((vma)->vm_policy)
-
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
@@ -107,35 +107,30 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
/*
* Tree of shared policies for a shared memory region.
- * Maintain the policies in a pseudo mm that contains vmas. The vmas
- * carry the policy. As a special twist the pseudo mm is indexed in pages, not
- * bytes, so that we can work with shared memory segments bigger than
- * unsigned long.
*/
-
-struct sp_node {
- struct rb_node nd;
- unsigned long start, end;
- struct mempolicy *policy;
-};
-
struct shared_policy {
struct rb_root root;
rwlock_t lock;
};
+struct sp_node {
+ struct rb_node nd;
+ pgoff_t start, end;
+ struct mempolicy *policy;
+};
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *info,
- struct vm_area_struct *vma,
- struct mempolicy *new);
-void mpol_free_shared_policy(struct shared_policy *p);
+int mpol_set_shared_policy(struct shared_policy *sp,
+ struct vm_area_struct *vma, struct mempolicy *mpol);
+void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
- unsigned long idx);
+ pgoff_t idx);
struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
- unsigned long addr);
+ unsigned long addr, pgoff_t *ilx);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);
extern void numa_default_policy(void);
@@ -149,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask);
-extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
-
extern unsigned int mempolicy_slab_node(void);
extern enum zone_type policy_zone;
@@ -174,7 +167,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
extern void mpol_put_task_policy(struct task_struct *);
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -188,12 +181,17 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
struct mempolicy {};
+static inline struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ return NULL;
+}
+
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
return true;
}
-static inline void mpol_put(struct mempolicy *p)
+static inline void mpol_put(struct mempolicy *pol)
{
}
@@ -212,17 +210,22 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp,
{
}
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}
static inline struct mempolicy *
-mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
return NULL;
}
-#define vma_policy(vma) NULL
+static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr, int order, pgoff_t *ilx)
+{
+ *ilx = 0;
+ return NULL;
+}
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -278,7 +281,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
}
#endif
-static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+static inline int mpol_misplaced(struct folio *folio,
+ struct vm_area_struct *vma,
unsigned long address)
{
return -1; /* no node preference */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 711dd9412561..2ce13e8a309b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page)
}
#ifdef CONFIG_NUMA_BALANCING
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
int node);
#else
-static inline int migrate_misplaced_page(struct page *page,
+static inline int migrate_misplaced_folio(struct folio *folio,
struct vm_area_struct *vma, int node)
{
return -EAGAIN; /* can't migrate now */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ba896e946651..418d26608ece 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -617,7 +617,7 @@ struct vm_operations_struct {
* policy.
*/
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
- unsigned long addr);
+ unsigned long addr, pgoff_t *ilx);
#endif
/*
* Called by vm_normal_page() for special PTEs to find the
@@ -935,6 +935,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & VM_ACCESS_FLAGS;
}
+static inline bool is_shared_maywrite(vm_flags_t vm_flags)
+{
+ return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
+ (VM_SHARED | VM_MAYWRITE);
+}
+
+static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
+{
+ return is_shared_maywrite(vma->vm_flags);
+}
+
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
@@ -1335,7 +1346,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
struct page *page, unsigned int nr, unsigned long addr);
vm_fault_t finish_fault(struct vm_fault *vmf);
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
/*
@@ -1684,26 +1694,26 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK);
+ return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK);
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page->_last_cpupid;
+ return folio->_last_cpupid;
}
static inline void page_cpupid_reset_last(struct page *page)
{
page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
+ return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
}
-extern int page_cpupid_xchg_last(struct page *page, int cpupid);
+int folio_xchg_last_cpupid(struct folio *folio, int cpupid);
static inline void page_cpupid_reset_last(struct page *page)
{
@@ -1711,11 +1721,12 @@ static inline void page_cpupid_reset_last(struct page *page)
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
int last_time;
- last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
+ last_time = folio_xchg_last_cpupid(folio,
+ time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
@@ -1729,19 +1740,19 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
}
}
#else /* !CONFIG_NUMA_BALANCING */
-static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
+static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
-static inline int xchg_page_access_time(struct page *page, int time)
+static inline int folio_xchg_access_time(struct folio *folio, int time)
{
return 0;
}
-static inline int page_cpupid_last(struct page *page)
+static inline int folio_last_cpupid(struct folio *folio)
{
- return page_to_nid(page); /* XXX */
+ return folio_nid(folio); /* XXX */
}
static inline int cpupid_to_nid(int cpupid)
@@ -2325,6 +2336,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
@@ -2411,8 +2424,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags);
-extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
- void *buf, int len, unsigned int gup_flags);
long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -2423,6 +2434,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
int *locked);
+/*
+ * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
+ */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
unsigned long addr,
int gup_flags,
@@ -2430,12 +2444,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
{
struct page *page;
struct vm_area_struct *vma;
- int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
+ int got;
+
+ if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
+ return ERR_PTR(-EINVAL);
+
+ got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
if (got < 0)
return ERR_PTR(got);
- if (got == 0)
- return NULL;
vma = vma_lookup(mm, addr);
if (WARN_ON_ONCE(!vma)) {
@@ -2478,7 +2495,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
- bool need_rmap_locks);
+ bool need_rmap_locks, bool for_stack);
/*
* Flags used by change_protection(). For now we make it a bitmap so
@@ -2626,14 +2643,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
*maxrss = hiwater_rss;
}
-#if defined(SPLIT_RSS_COUNTING)
-void sync_mm_rss(struct mm_struct *mm);
-#else
-static inline void sync_mm_rss(struct mm_struct *mm)
-{
-}
-#endif
-
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
@@ -3055,6 +3064,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
return ptl;
}
+static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline void pagetable_pud_dtor(struct ptdesc *ptdesc)
+{
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
+}
+
extern void __init pagecache_init(void);
extern void free_initmem(void);
@@ -3219,22 +3244,73 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *next);
extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long start, unsigned long end, pgoff_t pgoff);
-extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi,
- struct mm_struct *, struct vm_area_struct *prev, unsigned long addr,
- unsigned long end, unsigned long vm_flags, struct anon_vma *,
- struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx,
- struct anon_vma_name *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
-extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
-extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *,
- unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void unlink_file_vma(struct vm_area_struct *);
extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long vm_flags,
+ struct mempolicy *policy,
+ struct vm_userfaultfd_ctx uffd_ctx,
+ struct anon_vma_name *anon_name);
+
+/* We are about to modify the VMA's flags. */
+static inline struct vm_area_struct
+*vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), vma->vm_userfaultfd_ctx,
+ anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+static inline struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long new_flags,
+ struct anon_vma_name *new_name)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), vma->vm_userfaultfd_ctx, new_name);
+}
+
+/* We are about to modify the VMA's memory policy. */
+static inline struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
+{
+ return vma_modify(vmi, prev, vma, start, end, vma->vm_flags,
+ new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+}
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+static inline struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags,
+ struct vm_userfaultfd_ctx new_ctx)
+{
+ return vma_modify(vmi, prev, vma, start, end, new_flags,
+ vma_policy(vma), new_ctx, anon_vma_name(vma));
+}
static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
@@ -3997,25 +4073,26 @@ static inline void mem_dump_obj(void *object) {}
#endif
/**
- * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it
+ * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and
+ * handle them.
* @seals: the seals to check
* @vma: the vma to operate on
*
- * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on
- * the vma flags. Return 0 if check pass, or <0 for errors.
+ * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper
+ * check/handling on the vma flags. Return 0 if check pass, or <0 for errors.
*/
-static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
+static inline int seal_check_write(int seals, struct vm_area_struct *vma)
{
- if (seals & F_SEAL_FUTURE_WRITE) {
+ if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
/*
* New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
+ * write seals are active.
*/
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
return -EPERM;
/*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
+ * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as
* MAP_SHARED and read-only, take care to not allow mprotect to
* revert protections on such mappings. Do this only for shared
* mappings. For private mappings, don't need to mask
@@ -4059,4 +4136,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end)
#endif
+static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
+{
+ phys_addr_t paddr = pfn << PAGE_SHIFT;
+
+ return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
+}
+
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8148b30a9df1..9ae7def16cb2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -4,6 +4,7 @@
#include <linux/atomic.h>
#include <linux/huge_mm.h>
+#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
@@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
}
#ifdef CONFIG_ANON_VMA_NAME
-/*
- * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
- * either keep holding the lock while using the returned pointer or it should
- * raise anon_vma_name refcount before releasing the lock.
- */
-extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
-extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
-extern void anon_vma_name_free(struct kref *kref);
-
/* mmap_lock should be read-locked */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
@@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
}
#else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
-{
- return NULL;
-}
-
-static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
-{
- return NULL;
-}
-
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4be8e310b189..957ce38768b2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -188,6 +188,10 @@ struct page {
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ int _last_cpupid;
+#endif
+
#ifdef CONFIG_KMSAN
/*
* KMSAN metadata for this page:
@@ -199,10 +203,6 @@ struct page {
struct page *kmsan_shadow;
struct page *kmsan_origin;
#endif
-
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
- int _last_cpupid;
-#endif
} _struct_page_alignment;
/*
@@ -261,6 +261,8 @@ typedef struct {
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
+ * @virtual: Virtual address in the kernel direct map.
+ * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
* @_nr_pages_mapped: Do not use directly, call folio_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -307,6 +309,12 @@ struct folio {
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
+#if defined(WANT_PAGE_VIRTUAL)
+ void *virtual;
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+ int _last_cpupid;
+#endif
/* private: the union with struct page is transitional */
};
struct page page;
@@ -362,6 +370,12 @@ FOLIO_MATCH(_refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(memcg_data, memcg_data);
#endif
+#if defined(WANT_PAGE_VIRTUAL)
+FOLIO_MATCH(virtual, virtual);
+#endif
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+FOLIO_MATCH(_last_cpupid, _last_cpupid);
+#endif
#undef FOLIO_MATCH
#define FOLIO_MATCH(pg, fl) \
static_assert(offsetof(struct folio, fl) == \
@@ -535,6 +549,27 @@ struct anon_vma_name {
char name[];
};
+#ifdef CONFIG_ANON_VMA_NAME
+/*
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
+ */
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
+struct anon_vma_name *anon_vma_name_alloc(const char *name);
+void anon_vma_name_free(struct kref *kref);
+#else /* CONFIG_ANON_VMA_NAME */
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ return NULL;
+}
+#endif
+
struct vma_lock {
struct rw_semaphore lock;
};
@@ -678,6 +713,12 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
+#ifdef CONFIG_NUMA
+#define vma_policy(vma) ((vma)->vm_policy)
+#else
+#define vma_policy(vma) NULL
+#endif
+
#ifdef CONFIG_SCHED_MM_CID
struct mm_cid {
u64 time;
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6e3c857606f1..f349e08a9dfe 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
-static inline int
+/*
+ * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
+ * can return an error if a notifier can't proceed without blocking, in which
+ * case you're not allowed to modify PTEs in the specified range.
+ *
+ * This is mainly intended for OOM handling.
+ */
+static inline int __must_check
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
int ret = 0;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4106fbc5b4b3..3c25226beeed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -639,8 +639,6 @@ struct lruvec {
#endif
};
-/* Isolate unmapped pages */
-#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
@@ -676,15 +674,34 @@ enum zone_watermarks {
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
+/*
+ * Flags used in pcp->flags field.
+ *
+ * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
+ * previous page freeing. To avoid to drain PCP for an accident
+ * high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough. To reduce
+ * zone lock contention and keep cache-hot pages reusing.
+ */
+#define PCPF_PREV_FREE_HIGH_ORDER BIT(0)
+#define PCPF_FREE_HIGH_BATCH BIT(1)
+
struct per_cpu_pages {
spinlock_t lock; /* Protects lists field */
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
+ int high_min; /* min high watermark */
+ int high_max; /* max high watermark */
int batch; /* chunk size for buddy add/remove */
- short free_factor; /* batch scaling factor during free */
+ u8 flags; /* protected by pcp->lock */
+ u8 alloc_factor; /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
- short expire; /* When 0, remote pagesets are drained */
+ u8 expire; /* When 0, remote pagesets are drained */
#endif
+ short free_count; /* consecutive free count */
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[NR_PCP_LISTS];
@@ -837,7 +854,8 @@ struct zone {
* the high and batch values are copied to individual pagesets for
* faster access
*/
- int pageset_high;
+ int pageset_high_min;
+ int pageset_high_max;
int pageset_batch;
#ifndef CONFIG_SPARSEMEM
@@ -998,6 +1016,7 @@ enum zone_flags {
* Cleared when kswapd is woken.
*/
ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */
+ ZONE_BELOW_HIGH, /* zone is below high watermark. */
};
static inline unsigned long zone_managed_pages(struct zone *zone)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5c02720c53a5..a88e64acebfe 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -693,6 +693,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm)
u64 stable_page_flags(struct page *page);
/**
+ * folio_xor_flags_has_waiters - Change some folio flags.
+ * @folio: The folio.
+ * @mask: Bits set in this word will be changed.
+ *
+ * This must only be used for flags which are changed with the folio
+ * lock held. For example, it is unsafe to use for PG_dirty as that
+ * can be set without the folio lock held. It can also only be used
+ * on flags which are in the range 0-6 as some of the implementations
+ * only affect those bits.
+ *
+ * Return: Whether there are tasks waiting on the folio.
+ */
+static inline bool folio_xor_flags_has_waiters(struct folio *folio,
+ unsigned long mask)
+{
+ return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
+}
+
+/**
* folio_test_uptodate - Is this folio up to date?
* @folio: The folio.
*
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 351c3b7f93a1..bcc1ea44b4e8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio)
*/
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
- /* HugeTLBfs indexes the page cache in units of hpage_size */
- if (folio_test_hugetlb(folio))
- return &folio->page;
return folio_page(folio, index & (folio_nr_pages(folio) - 1));
}
@@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
*/
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
- /* HugeTLBfs indexes the page cache in units of hpage_size */
- if (folio_test_hugetlb(folio))
- return folio->index == index;
return index - folio_index(folio) < folio_nr_pages(folio);
}
@@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping,
}
/*
- * Get index of the page within radix-tree (but not for hugetlb pages).
- * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE)
+ * Get the offset in PAGE_SIZE (even for hugetlb pages).
*/
-static inline pgoff_t page_to_index(struct page *page)
+static inline pgoff_t page_to_pgoff(struct page *page)
{
struct page *head;
@@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page)
return head->index + page - head;
}
-extern pgoff_t hugetlb_basepage_index(struct page *page);
-
-/*
- * Get the offset in PAGE_SIZE (even for hugetlb pages).
- * (TODO: hugetlb pages should have ->index in PAGE_SIZE)
- */
-static inline pgoff_t page_to_pgoff(struct page *page)
-{
- if (unlikely(PageHuge(page)))
- return hugetlb_basepage_index(page);
- return page_to_index(page);
-}
-
/*
* Return byte-offset into filesystem object for page.
*/
@@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio)
/*
* Get the offset in PAGE_SIZE (even for hugetlb folios).
- * (TODO: hugetlb folios should have ->index in PAGE_SIZE)
*/
static inline pgoff_t folio_pgoff(struct folio *folio)
{
- if (unlikely(folio_test_hugetlb(folio)))
- return hugetlb_basepage_index(&folio->page);
return folio->index;
}
-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address);
-
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
pgoff_t pgoff;
- if (unlikely(is_vm_hugetlb_page(vma)))
- return linear_hugepage_index(vma, address);
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff;
@@ -1129,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page)
folio_wait_locked(page_folio(page));
}
+void folio_end_read(struct folio *folio, bool success);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d01351b1526f..3a44dd1e33d2 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
+ s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
@@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+ return __percpu_counter_limited_add(fbc, limit, amount,
+ percpu_counter_batch);
+}
+
/*
* With percpu_counter_add_local() and percpu_counter_sub_local(), counts
* are accumulated in local per cpu counter and not in fbc->count until
@@ -185,6 +194,27 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
local_irq_restore(flags);
}
+static inline bool
+percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
+{
+ unsigned long flags;
+ bool good = false;
+ s64 count;
+
+ if (amount == 0)
+ return true;
+
+ local_irq_save(flags);
+ count = fbc->count + amount;
+ if ((amount > 0 && count <= limit) ||
+ (amount < 0 && count >= limit)) {
+ fbc->count = count;
+ good = true;
+ }
+ local_irq_restore(flags);
+ return good;
+}
+
/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 51cc21ebb568..b26fe858fd44 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -189,7 +189,7 @@ typedef int __bitwise rmap_t;
/*
* rmap interfaces called when adding or removing pte of page
*/
-void page_move_anon_rmap(struct page *, struct vm_area_struct *);
+void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void page_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
@@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
void page_remove_rmap(struct page *, struct vm_area_struct *,
bool compound);
-void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 12ec109ce8c9..b49ca40f6335 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1448,6 +1448,10 @@ struct task_struct {
struct mem_cgroup *active_memcg;
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *objcg;
+#endif
+
#ifdef CONFIG_BLK_CGROUP
struct gendisk *throttle_disk;
#endif
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0ee96ea7a0e9..02f5090ffea2 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS 26 /* mm is shared between processes */
/*
@@ -85,10 +86,22 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_HAS_MDWE 28
#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
-#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
+
+#define MMF_HAS_MDWE_NO_INHERIT 29
+
+#define MMF_VM_MERGE_ANY 30
+#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
- MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
+ MMF_VM_MERGE_ANY_MASK)
+
+static inline unsigned long mmf_init_flags(unsigned long flags)
+{
+ if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT))
+ flags &= ~((1UL << MMF_HAS_MDWE) |
+ (1UL << MMF_HAS_MDWE_NO_INHERIT));
+ return flags & MMF_INIT_MASK;
+}
-#define MMF_VM_MERGE_ANY 29
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 8d89c8c4fac1..9a19f1b42f64 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
* __GFP_ACCOUNT allocations till the end of the scope will be charged to the
* given memcg.
*
+ * Please, make sure that caller has a reference to the passed memcg structure,
+ * so its lifetime is guaranteed to exceed the scope between two
+ * set_active_memcg() calls.
+ *
* NOTE: This function can nest. Users must save the return value and
* reset the previous value after their own charging scope is over.
*/
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index b69afb8630db..52b22c5c396d 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -30,8 +30,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p, bool final);
-extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- int src_nid, int dst_cpu);
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
+ int src_nid, int dst_cpu);
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -48,7 +48,7 @@ static inline void task_numa_free(struct task_struct *p, bool final)
{
}
static inline bool should_numa_migrate_memory(struct task_struct *p,
- struct page *page, int src_nid, int dst_cpu)
+ struct folio *folio, int src_nid, int dst_cpu)
{
return true;
}
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6b0c626620f5..2caa6b86106a 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,18 +23,22 @@ struct shmem_inode_info {
unsigned long flags;
unsigned long alloced; /* data pages alloced to file */
unsigned long swapped; /* subtotal assigned to swap */
- pgoff_t fallocend; /* highest fallocate endindex */
- struct list_head shrinklist; /* shrinkable hpage inodes */
- struct list_head swaplist; /* chain of maybes on swap */
+ union {
+ struct offset_ctx dir_offsets; /* stable directory offsets */
+ struct {
+ struct list_head shrinklist; /* shrinkable hpage inodes */
+ struct list_head swaplist; /* chain of maybes on swap */
+ };
+ };
+ struct timespec64 i_crtime; /* file creation time */
struct shared_policy policy; /* NUMA memory alloc policy */
struct simple_xattrs xattrs; /* list of xattrs */
+ pgoff_t fallocend; /* highest fallocate endindex */
+ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */
atomic_t stop_eviction; /* hold when working on inode */
- struct timespec64 i_crtime; /* file creation time */
- unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */
#ifdef CONFIG_TMPFS_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
#endif
- struct offset_ctx dir_offsets; /* stable entry offsets */
struct inode vfs_inode;
};
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 224293b2dd06..1a00be90d93a 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,25 @@
#include <linux/atomic.h>
#include <linux/types.h>
+#include <linux/refcount.h>
+#include <linux/completion.h>
+
+#define SHRINKER_UNIT_BITS BITS_PER_LONG
+
+/*
+ * Bitmap and deferred work of shrinker::id corresponding to memcg-aware
+ * shrinkers, which have elements charged to the memcg.
+ */
+struct shrinker_info_unit {
+ atomic_long_t nr_deferred[SHRINKER_UNIT_BITS];
+ DECLARE_BITMAP(map, SHRINKER_UNIT_BITS);
+};
+
+struct shrinker_info {
+ struct rcu_head rcu;
+ int map_nr_max;
+ struct shrinker_info_unit *unit[];
+};
/*
* This struct is used to pass information from page reclaim to the shrinkers.
@@ -70,6 +89,19 @@ struct shrinker {
int seeks; /* seeks to recreate an obj */
unsigned flags;
+ /*
+ * The reference count of this shrinker. Registered shrinker have an
+ * initial refcount of 1, then the lookup operations are now allowed
+ * to use it via shrinker_try_get(). Later in the unregistration step,
+ * the initial refcount will be discarded, and will free the shrinker
+ * asynchronously via RCU after its refcount reaches 0.
+ */
+ refcount_t refcount;
+ struct completion done; /* use to wait for refcount to reach 0 */
+ struct rcu_head rcu;
+
+ void *private_data;
+
/* These are for internal use */
struct list_head list;
#ifdef CONFIG_MEMCG
@@ -86,48 +118,39 @@ struct shrinker {
};
#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-/* Flags */
-#define SHRINKER_REGISTERED (1 << 0)
-#define SHRINKER_NUMA_AWARE (1 << 1)
-#define SHRINKER_MEMCG_AWARE (1 << 2)
+/* Internal flags */
+#define SHRINKER_REGISTERED BIT(0)
+#define SHRINKER_ALLOCATED BIT(1)
+
+/* Flags for users to use */
+#define SHRINKER_NUMA_AWARE BIT(2)
+#define SHRINKER_MEMCG_AWARE BIT(3)
/*
* It just makes sense when the shrinker is also MEMCG_AWARE for now,
* non-MEMCG_AWARE shrinker should not have this flag set.
*/
-#define SHRINKER_NONSLAB (1 << 3)
+#define SHRINKER_NONSLAB BIT(4)
-extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker,
- const char *fmt, ...);
-extern void register_shrinker_prepared(struct shrinker *shrinker);
-extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker,
- const char *fmt, ...);
-extern void unregister_shrinker(struct shrinker *shrinker);
-extern void free_prealloced_shrinker(struct shrinker *shrinker);
-extern void synchronize_shrinkers(void);
+__printf(2, 3)
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
+void shrinker_register(struct shrinker *shrinker);
+void shrinker_free(struct shrinker *shrinker);
-#ifdef CONFIG_SHRINKER_DEBUG
-extern int shrinker_debugfs_add(struct shrinker *shrinker);
-extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
- int *debugfs_id);
-extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
- int debugfs_id);
-extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
- const char *fmt, ...);
-#else /* CONFIG_SHRINKER_DEBUG */
-static inline int shrinker_debugfs_add(struct shrinker *shrinker)
-{
- return 0;
-}
-static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
- int *debugfs_id)
+static inline bool shrinker_try_get(struct shrinker *shrinker)
{
- *debugfs_id = -1;
- return NULL;
+ return refcount_inc_not_zero(&shrinker->refcount);
}
-static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
- int debugfs_id)
+
+static inline void shrinker_put(struct shrinker *shrinker)
{
+ if (refcount_dec_and_test(&shrinker->refcount))
+ complete(&shrinker->done);
}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker,
+ const char *fmt, ...);
+#else /* CONFIG_SHRINKER_DEBUG */
static inline __printf(2, 3)
int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
{
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac8c6854097c..f2dc19f40d05 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
}
static inline bool vma_can_userfault(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ unsigned long vm_flags,
+ bool wp_async)
{
+ vm_flags &= __VM_UFFD_FLAGS;
+
if ((vm_flags & VM_UFFD_MINOR) &&
(!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
return false;
+
+ /*
+ * If wp async enabled, and WP is the only mode enabled, allow any
+ * memory type.
+ */
+ if (wp_async && (vm_flags == VM_UFFD_WP))
+ return true;
+
#ifndef CONFIG_PTE_MARKER_UFFD_WP
/*
* If user requested uffd-wp but not enabled pte markers for
@@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
return false;
#endif
+
+ /* By default, allow any of anon|shmem|hugetlb */
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
vma_is_shmem(vma);
}
@@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
+extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
#else /* CONFIG_USERFAULTFD */
@@ -207,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
return VM_FAULT_SIGBUS;
}
+static inline long uffd_wp_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long len,
+ bool enable_wp)
+{
+ return false;
+}
+
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx vm_ctx)
{
@@ -297,6 +318,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5ec7739400f4..3473b663176f 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
/* wait_queue_entry::flags */
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
-#define WQ_FLAG_BOOKMARK 0x04
-#define WQ_FLAG_CUSTOM 0x08
-#define WQ_FLAG_DONE 0x10
-#define WQ_FLAG_PRIORITY 0x20
+#define WQ_FLAG_CUSTOM 0x04
+#define WQ_FLAG_DONE 0x08
+#define WQ_FLAG_PRIORITY 0x10
/*
* A single wait-queue entry structure:
@@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
-void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
- unsigned int mode, void *key, wait_queue_entry_t *bookmark);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index c79f1d4c39af..23200aabccac 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -9,12 +9,51 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
+TRACE_EVENT_CONDITION(damos_before_apply,
+
+ TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
+ unsigned int target_idx, struct damon_region *r,
+ unsigned int nr_regions, bool do_trace),
+
+ TP_ARGS(context_idx, target_idx, scheme_idx, r, nr_regions, do_trace),
+
+ TP_CONDITION(do_trace),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, context_idx)
+ __field(unsigned int, scheme_idx)
+ __field(unsigned long, target_idx)
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, nr_accesses)
+ __field(unsigned int, age)
+ __field(unsigned int, nr_regions)
+ ),
+
+ TP_fast_assign(
+ __entry->context_idx = context_idx;
+ __entry->scheme_idx = scheme_idx;
+ __entry->target_idx = target_idx;
+ __entry->start = r->ar.start;
+ __entry->end = r->ar.end;
+ __entry->nr_accesses = r->nr_accesses_bp / 10000;
+ __entry->age = r->age;
+ __entry->nr_regions = nr_regions;
+ ),
+
+ TP_printk("ctx_idx=%u scheme_idx=%u target_idx=%lu nr_regions=%u %lu-%lu: %u %u",
+ __entry->context_idx, __entry->scheme_idx,
+ __entry->target_idx, __entry->nr_regions,
+ __entry->start, __entry->end,
+ __entry->nr_accesses, __entry->age)
+);
+
TRACE_EVENT(damon_aggregated,
- TP_PROTO(struct damon_target *t, unsigned int target_id,
- struct damon_region *r, unsigned int nr_regions),
+ TP_PROTO(unsigned int target_id, struct damon_region *r,
+ unsigned int nr_regions),
- TP_ARGS(t, target_id, r, nr_regions),
+ TP_ARGS(target_id, r, nr_regions),
TP_STRUCT__entry(
__field(unsigned long, target_id)
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 061b5128f335..0190ef725b43 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -49,10 +49,11 @@ TRACE_EVENT(mm_migrate_pages,
TP_PROTO(unsigned long succeeded, unsigned long failed,
unsigned long thp_succeeded, unsigned long thp_failed,
- unsigned long thp_split, enum migrate_mode mode, int reason),
+ unsigned long thp_split, unsigned long large_folio_split,
+ enum migrate_mode mode, int reason),
TP_ARGS(succeeded, failed, thp_succeeded, thp_failed,
- thp_split, mode, reason),
+ thp_split, large_folio_split, mode, reason),
TP_STRUCT__entry(
__field( unsigned long, succeeded)
@@ -60,26 +61,29 @@ TRACE_EVENT(mm_migrate_pages,
__field( unsigned long, thp_succeeded)
__field( unsigned long, thp_failed)
__field( unsigned long, thp_split)
+ __field( unsigned long, large_folio_split)
__field( enum migrate_mode, mode)
__field( int, reason)
),
TP_fast_assign(
- __entry->succeeded = succeeded;
- __entry->failed = failed;
- __entry->thp_succeeded = thp_succeeded;
- __entry->thp_failed = thp_failed;
- __entry->thp_split = thp_split;
- __entry->mode = mode;
- __entry->reason = reason;
+ __entry->succeeded = succeeded;
+ __entry->failed = failed;
+ __entry->thp_succeeded = thp_succeeded;
+ __entry->thp_failed = thp_failed;
+ __entry->thp_split = thp_split;
+ __entry->large_folio_split = large_folio_split;
+ __entry->mode = mode;
+ __entry->reason = reason;
),
- TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu mode=%s reason=%s",
+ TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu nr_split=%lu mode=%s reason=%s",
__entry->succeeded,
__entry->failed,
__entry->thp_succeeded,
__entry->thp_failed,
__entry->thp_split,
+ __entry->large_folio_split,
__print_symbolic(__entry->mode, MIGRATE_MODE),
__print_symbolic(__entry->reason, MIGRATE_REASON))
);
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index d2123dd960d5..1a488c30afa5 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -285,10 +285,9 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
unsigned long nr_scanned,
unsigned long nr_skipped,
unsigned long nr_taken,
- isolate_mode_t isolate_mode,
int lru),
- TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
+ TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru),
TP_STRUCT__entry(
__field(int, highest_zoneidx)
@@ -297,7 +296,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
__field(unsigned long, nr_scanned)
__field(unsigned long, nr_skipped)
__field(unsigned long, nr_taken)
- __field(unsigned int, isolate_mode)
__field(int, lru)
),
@@ -308,7 +306,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
__entry->nr_scanned = nr_scanned;
__entry->nr_skipped = nr_skipped;
__entry->nr_taken = nr_taken;
- __entry->isolate_mode = (__force unsigned int)isolate_mode;
__entry->lru = lru;
),
@@ -316,8 +313,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
* classzone is previous name of the highest_zoneidx.
* Reason not to change it is the ABI requirement of the tracepoint.
*/
- TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
- __entry->isolate_mode,
+ TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
__entry->highest_zoneidx,
__entry->order,
__entry->nr_requested,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index b7b56871029c..da43810b7485 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t;
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND)
+/* Pagemap ioctl */
+#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)
+
+/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */
+#define PAGE_IS_WPALLOWED (1 << 0)
+#define PAGE_IS_WRITTEN (1 << 1)
+#define PAGE_IS_FILE (1 << 2)
+#define PAGE_IS_PRESENT (1 << 3)
+#define PAGE_IS_SWAPPED (1 << 4)
+#define PAGE_IS_PFNZERO (1 << 5)
+#define PAGE_IS_HUGE (1 << 6)
+
+/*
+ * struct page_region - Page region with flags
+ * @start: Start of the region
+ * @end: End of the region (exclusive)
+ * @categories: PAGE_IS_* category bitmask for the region
+ */
+struct page_region {
+ __u64 start;
+ __u64 end;
+ __u64 categories;
+};
+
+/* Flags for PAGEMAP_SCAN ioctl */
+#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */
+#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */
+
+/*
+ * struct pm_scan_arg - Pagemap ioctl argument
+ * @size: Size of the structure
+ * @flags: Flags for the IOCTL
+ * @start: Starting address of the region
+ * @end: Ending address of the region
+ * @walk_end Address where the scan stopped (written by kernel).
+ * walk_end == end (address tags cleared) informs that the scan completed on entire range.
+ * @vec: Address of page_region struct array for output
+ * @vec_len: Length of the page_region struct array
+ * @max_pages: Optional limit for number of returned pages (0 = disabled)
+ * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1
+ * @category_mask: Skip pages for which any category doesn't match
+ * @category_anyof_mask: Skip pages for which no category matches
+ * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned
+ */
+struct pm_scan_arg {
+ __u64 size;
+ __u64 flags;
+ __u64 start;
+ __u64 end;
+ __u64 walk_end;
+ __u64 vec;
+ __u64 vec_len;
+ __u64 max_pages;
+ __u64 category_inverted;
+ __u64 category_mask;
+ __u64 category_anyof_mask;
+ __u64 return_mask;
+};
+
#endif /* _UAPI_LINUX_FS_H */
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..a8963f7ef4c2 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -48,7 +48,7 @@ enum {
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
-#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
+#define MPOL_MF_LAZY (1<<3) /* UNSUPPORTED FLAG: Lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 3c36aeade991..370ed14b1ae0 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -283,7 +283,8 @@ struct prctl_mm_map {
/* Memory deny write / execute */
#define PR_SET_MDWE 65
-# define PR_MDWE_REFUSE_EXEC_GAIN 1
+# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0)
+# define PR_MDWE_NO_INHERIT (1UL << 1)
#define PR_GET_MDWE 66
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 62151706c5a3..0dbc81015018 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -40,7 +40,8 @@
UFFD_FEATURE_EXACT_ADDRESS | \
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
UFFD_FEATURE_WP_UNPOPULATED | \
- UFFD_FEATURE_POISON)
+ UFFD_FEATURE_POISON | \
+ UFFD_FEATURE_WP_ASYNC)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -216,6 +217,11 @@ struct uffdio_api {
* (i.e. empty ptes). This will be the default behavior for shmem
* & hugetlbfs, so this flag only affects anonymous memory behavior
* when userfault write-protection mode is registered.
+ *
+ * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection
+ * asynchronous mode is supported in which the write fault is
+ * automatically resolved and write-protection is un-set.
+ * It implies UFFD_FEATURE_WP_UNPOPULATED.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -232,6 +238,7 @@ struct uffdio_api {
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
#define UFFD_FEATURE_POISON (1<<14)
+#define UFFD_FEATURE_WP_ASYNC (1<<15)
__u64 features;
__u64 ioctls;