summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 11:18:47 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-09 11:18:47 -0800
commitfb46e22a9e3863e08aef8815df9f17d0f4b9aede (patch)
tree83e052911fa8d8d90bcf9de2796e17e19040613f /include
parentd30e51aa7b1f6fa7dd78d4598d1e4c047fcc3fb9 (diff)
parent5e0a760b44417f7cadd79de2204d6247109558a0 (diff)
Merge tag 'mm-stable-2024-01-08-15-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Many singleton patches against the MM code. The patch series which are included in this merge do the following: - Peng Zhang has done some mapletree maintainance work in the series 'maple_tree: add mt_free_one() and mt_attr() helpers' 'Some cleanups of maple tree' - In the series 'mm: use memmap_on_memory semantics for dax/kmem' Vishal Verma has altered the interworking between memory-hotplug and dax/kmem so that newly added 'device memory' can more easily have its memmap placed within that newly added memory. - Matthew Wilcox continues folio-related work (including a few fixes) in the patch series 'Add folio_zero_tail() and folio_fill_tail()' 'Make folio_start_writeback return void' 'Fix fault handler's handling of poisoned tail pages' 'Convert aops->error_remove_page to ->error_remove_folio' 'Finish two folio conversions' 'More swap folio conversions' - Kefeng Wang has also contributed folio-related work in the series 'mm: cleanup and use more folio in page fault' - Jim Cromie has improved the kmemleak reporting output in the series 'tweak kmemleak report format'. - In the series 'stackdepot: allow evicting stack traces' Andrey Konovalov to permits clients (in this case KASAN) to cause eviction of no longer needed stack traces. - Charan Teja Kalla has fixed some accounting issues in the page allocator's atomic reserve calculations in the series 'mm: page_alloc: fixes for high atomic reserve caluculations'. - Dmitry Rokosov has added to the samples/ dorectory some sample code for a userspace memcg event listener application. See the series 'samples: introduce cgroup events listeners'. - Some mapletree maintanance work from Liam Howlett in the series 'maple_tree: iterator state changes'. - Nhat Pham has improved zswap's approach to writeback in the series 'workload-specific and memory pressure-driven zswap writeback'. - DAMON/DAMOS feature and maintenance work from SeongJae Park in the series 'mm/damon: let users feed and tame/auto-tune DAMOS' 'selftests/damon: add Python-written DAMON functionality tests' 'mm/damon: misc updates for 6.8' - Yosry Ahmed has improved memcg's stats flushing in the series 'mm: memcg: subtree stats flushing and thresholds'. - In the series 'Multi-size THP for anonymous memory' Ryan Roberts has added a runtime opt-in feature to transparent hugepages which improves performance by allocating larger chunks of memory during anonymous page faults. - Matthew Wilcox has also contributed some cleanup and maintenance work against eh buffer_head code int he series 'More buffer_head cleanups'. - Suren Baghdasaryan has done work on Andrea Arcangeli's series 'userfaultfd move option'. UFFDIO_MOVE permits userspace heap compaction algorithms to move userspace's pages around rather than UFFDIO_COPY'a alloc/copy/free. - Stefan Roesch has developed a 'KSM Advisor', in the series 'mm/ksm: Add ksm advisor'. This is a governor which tunes KSM's scanning aggressiveness in response to userspace's current needs. - Chengming Zhou has optimized zswap's temporary working memory use in the series 'mm/zswap: dstmem reuse optimizations and cleanups'. - Matthew Wilcox has performed some maintenance work on the writeback code, both code and within filesystems. The series is 'Clean up the writeback paths'. - Andrey Konovalov has optimized KASAN's handling of alloc and free stack traces for secondary-level allocators, in the series 'kasan: save mempool stack traces'. - Andrey also performed some KASAN maintenance work in the series 'kasan: assorted clean-ups'. - David Hildenbrand has gone to town on the rmap code. Cleanups, more pte batching, folio conversions and more. See the series 'mm/rmap: interface overhaul'. - Kinsey Ho has contributed some maintenance work on the MGLRU code in the series 'mm/mglru: Kconfig cleanup'. - Matthew Wilcox has contributed lruvec page accounting code cleanups in the series 'Remove some lruvec page accounting functions'" * tag 'mm-stable-2024-01-08-15-31' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (361 commits) mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER mm, treewide: introduce NR_PAGE_ORDERS selftests/mm: add separate UFFDIO_MOVE test for PMD splitting selftests/mm: skip test if application doesn't has root privileges selftests/mm: conform test to TAP format output selftests: mm: hugepage-mmap: conform to TAP format output selftests/mm: gup_test: conform test to TAP format output mm/selftests: hugepage-mremap: conform test to TAP format output mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING mm: zsmalloc: return -ENOSPC rather than -EINVAL in zs_malloc while size is too large mm/memcontrol: remove __mod_lruvec_page_state() mm/khugepaged: use a folio more in collapse_file() slub: use a folio in __kmalloc_large_node slub: use folio APIs in free_large_kmalloc() slub: use alloc_pages_node() in alloc_slab_page() mm: remove inc/dec lruvec page state functions mm: ratelimit stat flush from workingset shrinker kasan: stop leaking stack trace handles mm/mglru: remove CONFIG_TRANSPARENT_HUGEPAGE mm/mglru: add dummy pmd_dirty() ...
Diffstat (limited to 'include')
-rw-r--r--include/drm/ttm/ttm_pool.h2
-rw-r--r--include/linux/buffer_head.h9
-rw-r--r--include/linux/cpuhotplug.h1
-rw-r--r--include/linux/damon.h22
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/gfp_types.h17
-rw-r--r--include/linux/highmem.h76
-rw-r--r--include/linux/huge_mm.h184
-rw-r--r--include/linux/hugetlb.h2
-rw-r--r--include/linux/kasan.h162
-rw-r--r--include/linux/ksm.h10
-rw-r--r--include/linux/list_lru.h88
-rw-r--r--include/linux/maple_tree.h349
-rw-r--r--include/linux/memblock.h1
-rw-r--r--include/linux/memcontrol.h37
-rw-r--r--include/linux/mempool.h1
-rw-r--r--include/linux/memremap.h12
-rw-r--r--include/linux/mm.h50
-rw-r--r--include/linux/mm_types.h27
-rw-r--r--include/linux/mmzone.h66
-rw-r--r--include/linux/page-flags.h9
-rw-r--r--include/linux/pageblock-flags.h4
-rw-r--r--include/linux/pgtable.h9
-rw-r--r--include/linux/rmap.h411
-rw-r--r--include/linux/slab.h2
-rw-r--r--include/linux/stackdepot.h61
-rw-r--r--include/linux/swap.h8
-rw-r--r--include/linux/userfaultfd_k.h11
-rw-r--r--include/linux/vm_event_item.h4
-rw-r--r--include/linux/vmstat.h60
-rw-r--r--include/linux/zswap.h32
-rw-r--r--include/trace/events/ksm.h33
-rw-r--r--include/uapi/linux/fs.h1
-rw-r--r--include/uapi/linux/userfaultfd.h29
34 files changed, 1317 insertions, 475 deletions
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 30a347e5aa11..4490d43c63e3 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -74,7 +74,7 @@ struct ttm_pool {
bool use_dma32;
struct {
- struct ttm_pool_type orders[MAX_ORDER + 1];
+ struct ttm_pool_type orders[NR_PAGE_ORDERS];
} caching[TTM_NUM_CACHING_TYPES];
};
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 5f23ee599889..d78454a4dd1f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
unsigned long blocksize, unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
-void end_buffer_async_write(struct buffer_head *bh, int uptodate);
/* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
@@ -252,11 +251,10 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[],
* address_spaces.
*/
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length);
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc);
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+ void *get_block);
int __block_write_full_folio(struct inode *inode, struct folio *folio,
- get_block_t *get_block, struct writeback_control *wbc,
- bh_end_io_t *handler);
+ get_block_t *get_block, struct writeback_control *wbc);
int block_read_full_folio(struct folio *, get_block_t *);
bool block_is_partially_uptodate(struct folio *, size_t from, size_t count);
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
@@ -270,7 +268,6 @@ int generic_write_end(struct file *, struct address_space *,
loff_t, unsigned, unsigned,
struct page *, void *);
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
-void clean_page_buffers(struct page *page);
int cont_write_begin(struct file *, struct address_space *, loff_t,
unsigned, struct page **, void **,
get_block_t *, loff_t *);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4f628d6a69d6..172d0a743e5d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -118,7 +118,6 @@ enum cpuhp_state {
CPUHP_ARM_BL_PREPARE,
CPUHP_TRACE_RB_PREPARE,
CPUHP_MM_ZS_PREPARE,
- CPUHP_MM_ZSWP_MEM_PREPARE,
CPUHP_MM_ZSWP_POOL_PREPARE,
CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE,
diff --git a/include/linux/damon.h b/include/linux/damon.h
index e00ddf1ed39c..5881e4ac30be 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -2,7 +2,7 @@
/*
* DAMON api
*
- * Author: SeongJae Park <sjpark@amazon.de>
+ * Author: SeongJae Park <sj@kernel.org>
*/
#ifndef _DAMON_H_
@@ -136,6 +136,9 @@ enum damos_action {
* @weight_nr_accesses: Weight of the region's nr_accesses for prioritization.
* @weight_age: Weight of the region's age for prioritization.
*
+ * @get_score: Feedback function for self-tuning quota.
+ * @get_score_arg: Parameter for @get_score
+ *
* To avoid consuming too much CPU time or IO resources for applying the
* &struct damos->action to large memory, DAMON allows users to set time and/or
* size quotas. The quotas can be set by writing non-zero values to &ms and
@@ -153,6 +156,17 @@ enum damos_action {
* You could customize the prioritization logic by setting &weight_sz,
* &weight_nr_accesses, and &weight_age, because monitoring operations are
* encouraged to respect those.
+ *
+ * If @get_score function pointer is set, DAMON calls it back with
+ * @get_score_arg and get the return value of it for every @reset_interval.
+ * Then, DAMON adjusts the effective quota using the return value as a feedback
+ * score to the current quota, using its internal feedback loop algorithm.
+ *
+ * The feedback loop algorithem assumes the quota input and the feedback score
+ * output are in a positive proportional relationship, and the goal of the
+ * tuning is getting the feedback screo value of 10,000. If @ms and/or @sz are
+ * set together, those work as a hard limit quota. If neither @ms nor @sz are
+ * set, the mechanism starts from the quota of one byte.
*/
struct damos_quota {
unsigned long ms;
@@ -163,6 +177,9 @@ struct damos_quota {
unsigned int weight_nr_accesses;
unsigned int weight_age;
+ unsigned long (*get_score)(void *arg);
+ void *get_score_arg;
+
/* private: */
/* For throughput estimation */
unsigned long total_charged_sz;
@@ -179,6 +196,9 @@ struct damos_quota {
/* For prioritization */
unsigned long histogram[DAMOS_MAX_SCORE + 1];
unsigned int min_score;
+
+ /* For feedback loop */
+ unsigned long esz_bp;
};
/**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cdbf43a8ea88..9314e8541745 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -434,7 +434,7 @@ struct address_space_operations {
bool (*is_partially_uptodate) (struct folio *, size_t from,
size_t count);
void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
- int (*error_remove_page)(struct address_space *, struct page *);
+ int (*error_remove_folio)(struct address_space *, struct folio *);
/* swapfile support */
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 6583a58670c5..1b6053da8754 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t;
* %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
*
* The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
+ * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
* !costly allocations are too essential to fail so they are implicitly
* non-failing by default (with some exceptions like OOM victims might fail so
* the caller still has to check for failures) while costly requests try to be
* not disruptive and back off even without invoking the OOM killer.
* The following three modifiers might be used to override some of these
- * implicit rules
+ * implicit rules.
*
* %__GFP_NORETRY: The VM implementation will try only very lightweight
* memory direct reclaim to get some memory under memory pressure (thus
* it can sleep). It will avoid disruptive actions like OOM killer. The
* caller must handle the failure which is quite likely to happen under
* heavy memory pressure. The flag is suitable when failure can easily be
- * handled at small cost, such as reduced throughput
+ * handled at small cost, such as reduced throughput.
*
* %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
* procedures that have previously failed if there is some indication
- * that progress has been made else where. It can wait for other
- * tasks to attempt high level approaches to freeing memory such as
+ * that progress has been made elsewhere. It can wait for other
+ * tasks to attempt high-level approaches to freeing memory such as
* compaction (which removes fragmentation) and page-out.
* There is still a definite limit to the number of retries, but it is
* a larger limit than with %__GFP_NORETRY.
@@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t;
* is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that
* __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting
* memory tags at the same time as zeroing memory has minimal additional
- * performace impact.
+ * performance impact.
*
* %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
* Used for userspace and vmalloc pages; the latter are unpoisoned by
@@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t;
* accounted to kmemcg.
*
* %GFP_NOWAIT is for kernel allocations that should not stall for direct
- * reclaim, start physical IO or use any filesystem callback.
+ * reclaim, start physical IO or use any filesystem callback. It is very
+ * likely to fail to allocate memory, even for very small allocations.
*
* %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
* that do not require the starting of any physical IO.
@@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t;
#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
-#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
+#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN)
#define GFP_NOIO (__GFP_RECLAIM)
#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO)
#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index be20cff4ba73..451c1dff0e87 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -484,6 +484,82 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset,
}
/**
+ * folio_zero_tail - Zero the tail of a folio.
+ * @folio: The folio to zero.
+ * @offset: The byte offset in the folio to start zeroing at.
+ * @kaddr: The address the folio is currently mapped to.
+ *
+ * If you have already used kmap_local_folio() to map a folio, written
+ * some data to it and now need to zero the end of the folio (and flush
+ * the dcache), you can use this function. If you do not have the
+ * folio kmapped (eg the folio has been partially populated by DMA),
+ * use folio_zero_range() or folio_zero_segment() instead.
+ *
+ * Return: An address which can be passed to kunmap_local().
+ */
+static inline __must_check void *folio_zero_tail(struct folio *folio,
+ size_t offset, void *kaddr)
+{
+ size_t len = folio_size(folio) - offset;
+
+ if (folio_test_highmem(folio)) {
+ size_t max = PAGE_SIZE - offset_in_page(offset);
+
+ while (len > max) {
+ memset(kaddr, 0, max);
+ kunmap_local(kaddr);
+ len -= max;
+ offset += max;
+ max = PAGE_SIZE;
+ kaddr = kmap_local_folio(folio, offset);
+ }
+ }
+
+ memset(kaddr, 0, len);
+ flush_dcache_folio(folio);
+
+ return kaddr;
+}
+
+/**
+ * folio_fill_tail - Copy some data to a folio and pad with zeroes.
+ * @folio: The destination folio.
+ * @offset: The offset into @folio at which to start copying.
+ * @from: The data to copy.
+ * @len: How many bytes of data to copy.
+ *
+ * This function is most useful for filesystems which support inline data.
+ * When they want to copy data from the inode into the page cache, this
+ * function does everything for them. It supports large folios even on
+ * HIGHMEM configurations.
+ */
+static inline void folio_fill_tail(struct folio *folio, size_t offset,
+ const char *from, size_t len)
+{
+ char *to = kmap_local_folio(folio, offset);
+
+ VM_BUG_ON(offset + len > folio_size(folio));
+
+ if (folio_test_highmem(folio)) {
+ size_t max = PAGE_SIZE - offset_in_page(offset);
+
+ while (len > max) {
+ memcpy(to, from, max);
+ kunmap_local(to);
+ len -= max;
+ from += max;
+ offset += max;
+ max = PAGE_SIZE;
+ to = kmap_local_folio(folio, offset);
+ }
+ }
+
+ memcpy(to, from, len);
+ to = folio_zero_tail(folio, offset + len, to + len);
+ kunmap_local(to);
+}
+
+/**
* memcpy_from_file_folio - Copy some bytes from a file folio.
* @to: The destination buffer.
* @folio: The folio to copy from.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index fa0350b0812a..5adb86af35fc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -67,6 +67,26 @@ extern struct kobj_attribute shmem_enabled_attr;
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
+/*
+ * Mask of all large folio orders supported for anonymous THP; all orders up to
+ * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
+ * (which is a limitation of the THP implementation).
+ */
+#define THP_ORDERS_ALL_ANON ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))
+
+/*
+ * Mask of all large folio orders supported for file THP.
+ */
+#define THP_ORDERS_ALL_FILE (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+
+/*
+ * Mask of all large folio orders supported for THP.
+ */
+#define THP_ORDERS_ALL (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
+
+#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
+ (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
@@ -77,45 +97,105 @@ extern struct kobj_attribute shmem_enabled_attr;
#define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1))
extern unsigned long transparent_hugepage_flags;
+extern unsigned long huge_anon_orders_always;
+extern unsigned long huge_anon_orders_madvise;
+extern unsigned long huge_anon_orders_inherit;
-#define hugepage_flags_enabled() \
- (transparent_hugepage_flags & \
- ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \
- (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
-#define hugepage_flags_always() \
- (transparent_hugepage_flags & \
- (1<<TRANSPARENT_HUGEPAGE_FLAG))
+static inline bool hugepage_global_enabled(void)
+{
+ return transparent_hugepage_flags &
+ ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
+}
+
+static inline bool hugepage_global_always(void)
+{
+ return transparent_hugepage_flags &
+ (1<<TRANSPARENT_HUGEPAGE_FLAG);
+}
+
+static inline bool hugepage_flags_enabled(void)
+{
+ /*
+ * We cover both the anon and the file-backed case here; we must return
+ * true if globally enabled, even when all anon sizes are set to never.
+ * So we don't need to look at huge_anon_orders_inherit.
+ */
+ return hugepage_global_enabled() ||
+ huge_anon_orders_always ||
+ huge_anon_orders_madvise;
+}
+
+static inline int highest_order(unsigned long orders)
+{
+ return fls_long(orders) - 1;
+}
+
+static inline int next_order(unsigned long *orders, int prev)
+{
+ *orders &= ~BIT(prev);
+ return highest_order(*orders);
+}
/*
* Do the below checks:
* - For file vma, check if the linear page offset of vma is
- * HPAGE_PMD_NR aligned within the file. The hugepage is
- * guaranteed to be hugepage-aligned within the file, but we must
- * check that the PMD-aligned addresses in the VMA map to
- * PMD-aligned offsets within the file, else the hugepage will
- * not be PMD-mappable.
- * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE
+ * order-aligned within the file. The hugepage is
+ * guaranteed to be order-aligned within the file, but we must
+ * check that the order-aligned addresses in the VMA map to
+ * order-aligned offsets within the file, else the hugepage will
+ * not be mappable.
+ * - For all vmas, check if the haddr is in an aligned hugepage
* area.
*/
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+ unsigned long addr, int order)
{
+ unsigned long hpage_size = PAGE_SIZE << order;
unsigned long haddr;
/* Don't have to check pgoff for anonymous vma */
if (!vma_is_anonymous(vma)) {
if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR))
+ hpage_size >> PAGE_SHIFT))
return false;
}
- haddr = addr & HPAGE_PMD_MASK;
+ haddr = ALIGN_DOWN(addr, hpage_size);
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end)
return false;
return true;
}
+/*
+ * Filter the bitfield of input orders to the ones suitable for use in the vma.
+ * See thp_vma_suitable_order().
+ * All orders that pass the checks are returned as a bitfield.
+ */
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orders)
+{
+ int order;
+
+ /*
+ * Iterate over orders, highest to lowest, removing orders that don't
+ * meet alignment requirements from the set. Exit loop at first order
+ * that meets requirements, since all lower orders must also meet
+ * requirements.
+ */
+
+ order = highest_order(orders);
+
+ while (orders) {
+ if (thp_vma_suitable_order(vma, addr, order))
+ break;
+ order = next_order(&orders, order);
+ }
+
+ return orders;
+}
+
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
struct inode *inode;
@@ -126,12 +206,55 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
inode = vma->vm_file->f_inode;
return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
- (vma->vm_flags & VM_EXEC) &&
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
}
-bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
- bool smaps, bool in_pf, bool enforce_sysfs);
+unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs,
+ unsigned long orders);
+
+/**
+ * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
+ * @vma: the vm area to check
+ * @vm_flags: use these vm_flags instead of vma->vm_flags
+ * @smaps: whether answer will be used for smaps file
+ * @in_pf: whether answer will be used by page fault handler
+ * @enforce_sysfs: whether sysfs config should be taken into account
+ * @orders: bitfield of all orders to consider
+ *
+ * Calculates the intersection of the requested hugepage orders and the allowed
+ * hugepage orders for the provided vma. Permitted orders are encoded as a set
+ * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3
+ * corresponds to order-3, etc). Order-0 is never considered a hugepage order.
+ *
+ * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
+ * orders are allowed.
+ */
+static inline
+unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs,
+ unsigned long orders)
+{
+ /* Optimization to check if required orders are enabled early. */
+ if (enforce_sysfs && vma_is_anonymous(vma)) {
+ unsigned long mask = READ_ONCE(huge_anon_orders_always);
+
+ if (vm_flags & VM_HUGEPAGE)
+ mask |= READ_ONCE(huge_anon_orders_madvise);
+ if (hugepage_global_always() ||
+ ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
+ mask |= READ_ONCE(huge_anon_orders_inherit);
+
+ orders &= mask;
+ if (!orders)
+ return 0;
+ }
+
+ return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
+ enforce_sysfs, orders);
+}
#define transparent_hugepage_use_zero_page() \
(transparent_hugepage_flags & \
@@ -267,17 +390,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
return false;
}
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long addr)
+static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
+ unsigned long addr, int order)
{
return false;
}
-static inline bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags, bool smaps,
- bool in_pf, bool enforce_sysfs)
+static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orders)
{
- return false;
+ return 0;
+}
+
+static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
+ unsigned long vm_flags, bool smaps,
+ bool in_pf, bool enforce_sysfs,
+ unsigned long orders)
+{
+ return 0;
}
static inline void folio_prep_large_rmappable(struct folio *folio) {}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 236ec7b63c54..c1ee640d87b1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
static inline bool hstate_is_gigantic(struct hstate *h)
{
- return huge_page_order(h) > MAX_ORDER;
+ return huge_page_order(h) > MAX_PAGE_ORDER;
}
static inline unsigned int pages_per_huge_page(const struct hstate *h)
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 72cb693b075b..dbb06d789e74 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -4,6 +4,7 @@
#include <linux/bug.h>
#include <linux/kasan-enabled.h>
+#include <linux/kasan-tags.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/types.h>
@@ -129,20 +130,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab)
__kasan_poison_slab(slab);
}
-void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Temporarily unpoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * temporarily unpoisons an object from a newly allocated slab without doing
+ * anything else. The object must later be repoisoned by
+ * kasan_poison_new_object().
+ */
+static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache,
void *object)
{
if (kasan_enabled())
- __kasan_unpoison_object_data(cache, object);
+ __kasan_unpoison_new_object(cache, object);
}
-void __kasan_poison_object_data(struct kmem_cache *cache, void *object);
-static __always_inline void kasan_poison_object_data(struct kmem_cache *cache,
+void __kasan_poison_new_object(struct kmem_cache *cache, void *object);
+/**
+ * kasan_unpoison_new_object - Repoison a new slab object.
+ * @cache: Cache the object belong to.
+ * @object: Pointer to the object.
+ *
+ * This function is intended for the slab allocator's internal use. It
+ * repoisons an object that was previously unpoisoned by
+ * kasan_unpoison_new_object() without doing anything else.
+ */
+static __always_inline void kasan_poison_new_object(struct kmem_cache *cache,
void *object)
{
if (kasan_enabled())
- __kasan_poison_object_data(cache, object);
+ __kasan_poison_new_object(cache, object);
}
void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
@@ -172,13 +192,6 @@ static __always_inline void kasan_kfree_large(void *ptr)
__kasan_kfree_large(ptr, _RET_IP_);
}
-void __kasan_slab_free_mempool(void *ptr, unsigned long ip);
-static __always_inline void kasan_slab_free_mempool(void *ptr)
-{
- if (kasan_enabled())
- __kasan_slab_free_mempool(ptr, _RET_IP_);
-}
-
void * __must_check __kasan_slab_alloc(struct kmem_cache *s,
void *object, gfp_t flags, bool init);
static __always_inline void * __must_check kasan_slab_alloc(
@@ -219,6 +232,113 @@ static __always_inline void * __must_check kasan_krealloc(const void *object,
return (void *)object;
}
+bool __kasan_mempool_poison_pages(struct page *page, unsigned int order,
+ unsigned long ip);
+/**
+ * kasan_mempool_poison_pages - Check and poison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function is similar to kasan_mempool_poison_object() but operates on
+ * page allocations.
+ *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_pages().
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
+ */
+static __always_inline bool kasan_mempool_poison_pages(struct page *page,
+ unsigned int order)
+{
+ if (kasan_enabled())
+ return __kasan_mempool_poison_pages(page, order, _RET_IP_);
+ return true;
+}
+
+void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
+ unsigned long ip);
+/**
+ * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation.
+ * @page: Pointer to the page allocation.
+ * @order: Order of the allocation.
+ *
+ * This function is intended for kernel subsystems that cache page allocations
+ * to reuse them instead of freeing them back to page_alloc (e.g. mempool).
+ *
+ * This function unpoisons a page allocation that was previously poisoned by
+ * kasan_mempool_poison_pages() without zeroing the allocation's memory. For
+ * the tag-based modes, this function assigns a new tag to the allocation.
+ */
+static __always_inline void kasan_mempool_unpoison_pages(struct page *page,
+ unsigned int order)
+{
+ if (kasan_enabled())
+ __kasan_mempool_unpoison_pages(page, order, _RET_IP_);
+}
+
+bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
+/**
+ * kasan_mempool_poison_object - Check and poison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function poisons a slab allocation and saves a free stack trace for it
+ * without initializing the allocation's memory and without putting it into the
+ * quarantine (for the Generic mode).
+ *
+ * This function also performs checks to detect double-free and invalid-free
+ * bugs and reports them. The caller can use the return value of this function
+ * to find out if the allocation is buggy.
+ *
+ * Before the poisoned allocation can be reused, it must be unpoisoned via
+ * kasan_mempool_unpoison_object().
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ *
+ * Return: true if the allocation can be safely reused; false otherwise.
+ */
+static __always_inline bool kasan_mempool_poison_object(void *ptr)
+{
+ if (kasan_enabled())
+ return __kasan_mempool_poison_object(ptr, _RET_IP_);
+ return true;
+}
+
+void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
+/**
+ * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation.
+ * @ptr: Pointer to the slab allocation.
+ * @size: Size to be unpoisoned.
+ *
+ * This function is intended for kernel subsystems that cache slab allocations
+ * to reuse them instead of freeing them back to the slab allocator (e.g.
+ * mempool).
+ *
+ * This function unpoisons a slab allocation that was previously poisoned via
+ * kasan_mempool_poison_object() and saves an alloc stack trace for it without
+ * initializing the allocation's memory. For the tag-based modes, this function
+ * does not assign a new tag to the allocation and instead restores the
+ * original tags based on the pointer value.
+ *
+ * This function operates on all slab allocations including large kmalloc
+ * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
+ * size > KMALLOC_MAX_SIZE).
+ */
+static __always_inline void kasan_mempool_unpoison_object(void *ptr,
+ size_t size)
+{
+ if (kasan_enabled())
+ __kasan_mempool_unpoison_object(ptr, size, _RET_IP_);
+}
+
/*
* Unlike kasan_check_read/write(), kasan_check_byte() is performed even for
* the hardware tag-based mode that doesn't rely on compiler instrumentation.
@@ -242,9 +362,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order,
return false;
}
static inline void kasan_poison_slab(struct slab *slab) {}
-static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+static inline void kasan_unpoison_new_object(struct kmem_cache *cache,
void *object) {}
-static inline void kasan_poison_object_data(struct kmem_cache *cache,
+static inline void kasan_poison_new_object(struct kmem_cache *cache,
void *object) {}
static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
@@ -256,7 +376,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init
return false;
}
static inline void kasan_kfree_large(void *ptr) {}
-static inline void kasan_slab_free_mempool(void *ptr) {}
static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
gfp_t flags, bool init)
{
@@ -276,6 +395,17 @@ static inline void *kasan_krealloc(const void *object, size_t new_size,
{
return (void *)object;
}
+static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order)
+{
+ return true;
+}
+static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {}
+static inline bool kasan_mempool_poison_object(void *ptr)
+{
+ return true;
+}
+static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {}
+
static inline bool kasan_check_byte(const void *address)
{
return true;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index c2dd786a30e1..401348e9f92b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -76,8 +76,8 @@ static inline void ksm_exit(struct mm_struct *mm)
* We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
* but what if the vma was unmerged while the page was swapped out?
*/
-struct page *ksm_might_need_to_copy(struct page *page,
- struct vm_area_struct *vma, unsigned long address);
+struct folio *ksm_might_need_to_copy(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr);
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
@@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0;
}
-static inline struct page *ksm_might_need_to_copy(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr)
{
- return page;
+ return folio;
}
static inline void rmap_walk_ksm(struct folio *folio,
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b35968ee9fb5..7675a48a0701 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -73,8 +73,10 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
/**
* list_lru_add: add an element to the lru list's tail
- * @list_lru: the lru pointer
+ * @lru: the lru pointer
* @item: the item to be added.
+ * @nid: the node id of the sublist to add the item to.
+ * @memcg: the cgroup of the sublist to add the item to.
*
* If the element is already part of a list, this function returns doing
* nothing. Therefore the caller does not need to keep state about whether or
@@ -83,24 +85,54 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
* the caller organize itself in a way that elements can be in more than
* one type of list, it is up to the caller to fully remove the item from
* the previous list (with list_lru_del() for instance) before moving it
- * to @list_lru
+ * to @lru.
+ *
+ * Return: true if the list was updated, false otherwise
+ */
+bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
+ struct mem_cgroup *memcg);
+
+/**
+ * list_lru_add_obj: add an element to the lru list's tail
+ * @lru: the lru pointer
+ * @item: the item to be added.
+ *
+ * This function is similar to list_lru_add(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
*
* Return value: true if the list was updated, false otherwise
*/
-bool list_lru_add(struct list_lru *lru, struct list_head *item);
+bool list_lru_add_obj(struct list_lru *lru, struct list_head *item);
/**
- * list_lru_del: delete an element to the lru list
- * @list_lru: the lru pointer
+ * list_lru_del: delete an element from the lru list
+ * @lru: the lru pointer
* @item: the item to be deleted.
+ * @nid: the node id of the sublist to delete the item from.
+ * @memcg: the cgroup of the sublist to delete the item from.
*
- * This function works analogously as list_lru_add in terms of list
+ * This function works analogously as list_lru_add() in terms of list
* manipulation. The comments about an element already pertaining to
- * a list are also valid for list_lru_del.
+ * a list are also valid for list_lru_del().
*
- * Return value: true if the list was updated, false otherwise
+ * Return: true if the list was updated, false otherwise
*/
-bool list_lru_del(struct list_lru *lru, struct list_head *item);
+bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
+ struct mem_cgroup *memcg);
+
+/**
+ * list_lru_del_obj: delete an element from the lru list
+ * @lru: the lru pointer
+ * @item: the item to be deleted.
+ *
+ * This function is similar to list_lru_del(), but the NUMA node and the
+ * memcg of the sublist is determined by @item list_head. This assumption is
+ * valid for slab objects LRU such as dentries, inodes, etc.
+ *
+ * Return value: true if the list was updated, false otherwise.
+ */
+bool list_lru_del_obj(struct list_lru *lru, struct list_head *item);
/**
* list_lru_count_one: return the number of objects currently held by @lru
@@ -108,9 +140,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
* @nid: the node id to count from.
* @memcg: the cgroup to count from.
*
- * Always return a non-negative number, 0 for empty lists. There is no
- * guarantee that the list is not updated while the count is being computed.
- * Callers that want such a guarantee need to provide an outer lock.
+ * There is no guarantee that the list is not updated while the count is being
+ * computed. Callers that want such a guarantee need to provide an outer lock.
+ *
+ * Return: 0 for empty lists, otherwise the number of objects
+ * currently held by @lru.
*/
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg);
@@ -136,12 +170,28 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
struct list_head *head);
+/**
+ * list_lru_putback: undo list_lru_isolate
+ * @lru: the lru pointer.
+ * @item: the item to put back.
+ * @nid: the node id of the sublist to put the item back to.
+ * @memcg: the cgroup of the sublist to put the item back to.
+ *
+ * Put back an isolated item into its original LRU. Note that unlike
+ * list_lru_add, this does not increment the node LRU count (as
+ * list_lru_isolate does not originally decrement this count).
+ *
+ * Since we might have dropped the LRU lock in between, recompute list_lru_one
+ * from the node's id and memcg.
+ */
+void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid,
+ struct mem_cgroup *memcg);
typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
/**
- * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a @lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
* @memcg: the cgroup to scan from.
@@ -150,24 +200,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
- * This function will scan all elements in a particular list_lru, calling the
+ * This function will scan all elements in a particular @lru, calling the
* @isolate callback for each of those items, along with the current list
* spinlock and a caller-provided opaque. The @isolate callback can choose to
* drop the lock internally, but *must* return with the lock held. The callback
- * will return an enum lru_status telling the list_lru infrastructure what to
+ * will return an enum lru_status telling the @lru infrastructure what to
* do with the object being scanned.
*
- * Please note that nr_to_walk does not mean how many objects will be freed,
+ * Please note that @nr_to_walk does not mean how many objects will be freed,
* just how many objects will be scanned.
*
- * Return value: the number of objects effectively removed from the LRU.
+ * Return: the number of objects effectively removed from the LRU.
*/
unsigned long list_lru_walk_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
/**
- * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
* @memcg: the cgroup to scan from.
@@ -176,7 +226,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru,
* @cb_arg: opaque type that will be passed to @isolate
* @nr_to_walk: how many items to scan.
*
- * Same as @list_lru_walk_one except that the spinlock is acquired with
+ * Same as list_lru_walk_one() except that the spinlock is acquired with
* spin_lock_irq().
*/
unsigned long list_lru_walk_one_irq(struct list_lru *lru,
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index d01e850b570f..b3d63123b945 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -256,6 +256,8 @@ struct maple_tree {
struct maple_tree name = MTREE_INIT(name, 0)
#define mtree_lock(mt) spin_lock((&(mt)->ma_lock))
+#define mtree_lock_nested(mas, subclass) \
+ spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock))
/*
@@ -327,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index,
void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);
+int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
+
void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);
@@ -345,6 +350,36 @@ static inline bool mtree_empty(const struct maple_tree *mt)
/* Advanced API */
/*
+ * Maple State Status
+ * ma_active means the maple state is pointing to a node and offset and can
+ * continue operating on the tree.
+ * ma_start means we have not searched the tree.
+ * ma_root means we have searched the tree and the entry we found lives in
+ * the root of the tree (ie it has index 0, length 1 and is the only entry in
+ * the tree).
+ * ma_none means we have searched the tree and there is no node in the
+ * tree for this entry. For example, we searched for index 1 in an empty
+ * tree. Or we have a tree which points to a full leaf node and we
+ * searched for an entry which is larger than can be contained in that
+ * leaf node.
+ * ma_pause means the data within the maple state may be stale, restart the
+ * operation
+ * ma_overflow means the search has reached the upper limit of the search
+ * ma_underflow means the search has reached the lower limit of the search
+ * ma_error means there was an error, check the node for the error number.
+ */
+enum maple_status {
+ ma_active,
+ ma_start,
+ ma_root,
+ ma_none,
+ ma_pause,
+ ma_overflow,
+ ma_underflow,
+ ma_error,
+};
+
+/*
* The maple state is defined in the struct ma_state and is used to keep track
* of information during operations, and even between operations when using the
* advanced API.
@@ -376,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt)
* When returning a value the maple state index and last respectively contain
* the start and end of the range for the entry. Ranges are inclusive in the
* Maple Tree.
+ *
+ * The status of the state is used to determine how the next action should treat
+ * the state. For instance, if the status is ma_start then the next action
+ * should start at the root of the tree and walk down. If the status is
+ * ma_pause then the node may be stale data and should be discarded. If the
+ * status is ma_overflow, then the last action hit the upper limit.
+ *
*/
struct ma_state {
struct maple_tree *tree; /* The tree we're operating in */
@@ -385,9 +427,11 @@ struct ma_state {
unsigned long min; /* The minimum index of this node - implied pivot min */
unsigned long max; /* The maximum index of this node - implied pivot max */
struct maple_alloc *alloc; /* Allocated nodes for this operation */
+ enum maple_status status; /* The status of the state (active, start, none, etc) */
unsigned char depth; /* depth of tree descent during write */
unsigned char offset;
unsigned char mas_flags;
+ unsigned char end; /* The end of the node */
};
struct ma_wr_state {
@@ -397,7 +441,6 @@ struct ma_wr_state {
unsigned long r_max; /* range max */
enum maple_type type; /* mas->node type */
unsigned char offset_end; /* The offset where the write ends */
- unsigned char node_end; /* mas->node end */
unsigned long *pivots; /* mas->node->pivots pointer */
unsigned long end_piv; /* The pivot at the offset end */
void __rcu **slots; /* mas->node->slots pointer */
@@ -406,30 +449,16 @@ struct ma_wr_state {
};
#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock))
+#define mas_lock_nested(mas, subclass) \
+ spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock))
-
/*
* Special values for ma_state.node.
- * MAS_START means we have not searched the tree.
- * MAS_ROOT means we have searched the tree and the entry we found lives in
- * the root of the tree (ie it has index 0, length 1 and is the only entry in
- * the tree).
- * MAS_NONE means we have searched the tree and there is no node in the
- * tree for this entry. For example, we searched for index 1 in an empty
- * tree. Or we have a tree which points to a full leaf node and we
- * searched for an entry which is larger than can be contained in that
- * leaf node.
* MA_ERROR represents an errno. After dropping the lock and attempting
* to resolve the error, the walk would have to be restarted from the
* top of the tree as the tree may have been modified.
*/
-#define MAS_START ((struct maple_enode *)1UL)
-#define MAS_ROOT ((struct maple_enode *)5UL)
-#define MAS_NONE ((struct maple_enode *)9UL)
-#define MAS_PAUSE ((struct maple_enode *)17UL)
-#define MAS_OVERFLOW ((struct maple_enode *)33UL)
-#define MAS_UNDERFLOW ((struct maple_enode *)65UL)
#define MA_ERROR(err) \
((struct maple_enode *)(((unsigned long)err << 2) | 2UL))
@@ -438,7 +467,8 @@ struct ma_wr_state {
.tree = mt, \
.index = first, \
.last = end, \
- .node = MAS_START, \
+ .node = NULL, \
+ .status = ma_start, \
.min = 0, \
.max = ULONG_MAX, \
.alloc = NULL, \
@@ -469,7 +499,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
-bool mas_is_err(struct ma_state *mas);
bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
@@ -498,28 +527,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
mas->tree = tree;
mas->index = mas->last = addr;
mas->max = ULONG_MAX;
- mas->node = MAS_START;
-}
-
-/* Checks if a mas has not found anything */
-static inline bool mas_is_none(const struct ma_state *mas)
-{
- return mas->node == MAS_NONE;
+ mas->status = ma_start;
+ mas->node = NULL;
}
-/* Checks if a mas has been paused */
-static inline bool mas_is_paused(const struct ma_state *mas)
+static inline bool mas_is_active(struct ma_state *mas)
{
- return mas->node == MAS_PAUSE;
+ return mas->status == ma_active;
}
-/* Check if the mas is pointing to a node or not */
-static inline bool mas_is_active(struct ma_state *mas)
+static inline bool mas_is_err(struct ma_state *mas)
{
- if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE)
- return true;
-
- return false;
+ return mas->status == ma_error;
}
/**
@@ -532,9 +551,10 @@ static inline bool mas_is_active(struct ma_state *mas)
*
* Context: Any context.
*/
-static inline void mas_reset(struct ma_state *mas)
+static __always_inline void mas_reset(struct ma_state *mas)
{
- mas->node = MAS_START;
+ mas->status = ma_start;
+ mas->node = NULL;
}
/**
@@ -550,6 +570,131 @@ static inline void mas_reset(struct ma_state *mas)
*/
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
+
+#ifdef CONFIG_DEBUG_MAPLE_TREE
+enum mt_dump_format {
+ mt_dump_dec,
+ mt_dump_hex,
+};
+
+extern atomic_t maple_tree_tests_run;
+extern atomic_t maple_tree_tests_passed;
+
+void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
+void mas_dump(const struct ma_state *mas);
+void mas_wr_dump(const struct ma_wr_state *wr_mas);
+void mt_validate(struct maple_tree *mt);
+void mt_cache_shrink(void);
+#define MT_BUG_ON(__tree, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mt_dump(__tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+
+#define MAS_BUG_ON(__mas, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_dump(__mas); \
+ mt_dump((__mas)->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+
+#define MAS_WR_BUG_ON(__wrmas, __x) do { \
+ atomic_inc(&maple_tree_tests_run); \
+ if (__x) { \
+ pr_info("BUG at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_wr_dump(__wrmas); \
+ mas_dump((__wrmas)->mas); \
+ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+} while (0)
+
+#define MT_WARN_ON(__tree, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mt_dump(__tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
+
+#define MAS_WARN_ON(__mas, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_dump(__mas); \
+ mt_dump((__mas)->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
+
+#define MAS_WR_WARN_ON(__wrmas, __x) ({ \
+ int ret = !!(__x); \
+ atomic_inc(&maple_tree_tests_run); \
+ if (ret) { \
+ pr_info("WARN at %s:%d (%u)\n", \
+ __func__, __LINE__, __x); \
+ mas_wr_dump(__wrmas); \
+ mas_dump((__wrmas)->mas); \
+ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
+ pr_info("Pass: %u Run:%u\n", \
+ atomic_read(&maple_tree_tests_passed), \
+ atomic_read(&maple_tree_tests_run)); \
+ dump_stack(); \
+ } else { \
+ atomic_inc(&maple_tree_tests_passed); \
+ } \
+ unlikely(ret); \
+})
+#else
+#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
+#define MAS_BUG_ON(__mas, __x) BUG_ON(__x)
+#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x)
+#define MT_WARN_ON(__tree, __x) WARN_ON(__x)
+#define MAS_WARN_ON(__mas, __x) WARN_ON(__x)
+#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x)
+#endif /* CONFIG_DEBUG_MAPLE_TREE */
+
/**
* __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
* current location.
@@ -563,6 +708,9 @@ static inline void mas_reset(struct ma_state *mas)
static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
unsigned long last)
{
+ /* Ensure the range starts within the current slot */
+ MAS_WARN_ON(mas, mas_is_active(mas) &&
+ (mas->index > start || mas->last < start));
mas->index = start;
mas->last = last;
}
@@ -580,8 +728,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
static inline
void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
{
+ mas_reset(mas);
__mas_set_range(mas, start, last);
- mas->node = MAS_START;
}
/**
@@ -706,129 +854,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
for (__entry = mt_find(__tree, &(__index), __max); \
__entry; __entry = mt_find_after(__tree, &(__index), __max))
-
-#ifdef CONFIG_DEBUG_MAPLE_TREE
-enum mt_dump_format {
- mt_dump_dec,
- mt_dump_hex,
-};
-
-extern atomic_t maple_tree_tests_run;
-extern atomic_t maple_tree_tests_passed;
-
-void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
-void mas_dump(const struct ma_state *mas);
-void mas_wr_dump(const struct ma_wr_state *wr_mas);
-void mt_validate(struct maple_tree *mt);
-void mt_cache_shrink(void);
-#define MT_BUG_ON(__tree, __x) do { \
- atomic_inc(&maple_tree_tests_run); \
- if (__x) { \
- pr_info("BUG at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mt_dump(__tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
-} while (0)
-
-#define MAS_BUG_ON(__mas, __x) do { \
- atomic_inc(&maple_tree_tests_run); \
- if (__x) { \
- pr_info("BUG at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mas_dump(__mas); \
- mt_dump((__mas)->tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
-} while (0)
-
-#define MAS_WR_BUG_ON(__wrmas, __x) do { \
- atomic_inc(&maple_tree_tests_run); \
- if (__x) { \
- pr_info("BUG at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mas_wr_dump(__wrmas); \
- mas_dump((__wrmas)->mas); \
- mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
-} while (0)
-
-#define MT_WARN_ON(__tree, __x) ({ \
- int ret = !!(__x); \
- atomic_inc(&maple_tree_tests_run); \
- if (ret) { \
- pr_info("WARN at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mt_dump(__tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
- unlikely(ret); \
-})
-
-#define MAS_WARN_ON(__mas, __x) ({ \
- int ret = !!(__x); \
- atomic_inc(&maple_tree_tests_run); \
- if (ret) { \
- pr_info("WARN at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mas_dump(__mas); \
- mt_dump((__mas)->tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
- unlikely(ret); \
-})
-
-#define MAS_WR_WARN_ON(__wrmas, __x) ({ \
- int ret = !!(__x); \
- atomic_inc(&maple_tree_tests_run); \
- if (ret) { \
- pr_info("WARN at %s:%d (%u)\n", \
- __func__, __LINE__, __x); \
- mas_wr_dump(__wrmas); \
- mas_dump((__wrmas)->mas); \
- mt_dump((__wrmas)->mas->tree, mt_dump_hex); \
- pr_info("Pass: %u Run:%u\n", \
- atomic_read(&maple_tree_tests_passed), \
- atomic_read(&maple_tree_tests_run)); \
- dump_stack(); \
- } else { \
- atomic_inc(&maple_tree_tests_passed); \
- } \
- unlikely(ret); \
-})
-#else
-#define MT_BUG_ON(__tree, __x) BUG_ON(__x)
-#define MAS_BUG_ON(__mas, __x) BUG_ON(__x)
-#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x)
-#define MT_WARN_ON(__tree, __x) WARN_ON(__x)
-#define MAS_WARN_ON(__mas, __x) WARN_ON(__x)
-#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x)
-#endif /* CONFIG_DEBUG_MAPLE_TREE */
-
#endif /*_LINUX_MAPLE_TREE_H */
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ae3bde302f70..b695f9e946da 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
bool memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size);
+bool memblock_validate_numa_coverage(unsigned long threshold_bytes);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7bdcf3020d7a..20ff87f8e001 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -219,6 +219,12 @@ struct mem_cgroup {
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
unsigned long zswap_max;
+
+ /*
+ * Prevent pages from this memcg from being written back from zswap to
+ * swap, and from being swapped out on zswap store failures.
+ */
+ bool zswap_writeback;
#endif
unsigned long soft_limit;
@@ -324,7 +330,7 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
@@ -821,6 +827,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
return !memcg || css_tryget(&memcg->css);
}
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget_online(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1046,8 +1057,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
-void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_ratelimited(void);
+void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
+void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
@@ -1187,6 +1198,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
return NULL;
}
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+ return NULL;
+}
+
static inline bool folio_memcg_kmem(struct folio *folio)
{
return false;
@@ -1349,6 +1365,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
return true;
}
+static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
@@ -1548,11 +1569,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
-static inline void mem_cgroup_flush_stats(void)
+static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}
-static inline void mem_cgroup_flush_stats_ratelimited(void)
+static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}
@@ -1926,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
+bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
@@ -1939,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
size_t size)
{
}
+static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
+{
+ /* if zswap is disabled, do not block pages going to the swapping device */
+ return true;
+}
#endif
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 4aae6c06c5f2..7be1e32e6d42 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
extern void mempool_free(void *element, mempool_t *pool);
/*
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 1314d9c5f05b..744c830f4b13 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
unsigned long memremap_compat_align(void);
#else
static inline void *devm_memremap_pages(struct device *dev,
@@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
return false;
}
-static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
- return 0;
-}
-
-static inline void vmem_altmap_free(struct vmem_altmap *altmap,
- unsigned long nr_pfns)
-{
-}
-
/* when memremap_pages() is disabled all archs can remap a single page */
static inline unsigned long memremap_compat_align(void)
{
diff --git a/include/linux/mm.h b/include/linux/mm.h
index da5219b48d52..896c0079f64f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
return mas_expected_entries(&vmi->mas, count);
}
+static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
+ unsigned long start, unsigned long end, gfp_t gfp)
+{
+ __mas_set_range(&vmi->mas, start, end - 1);
+ mas_store_gfp(&vmi->mas, NULL, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
/* Free any unused preallocations */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
@@ -1804,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
static inline u8 page_kasan_tag(const struct page *page)
{
- u8 tag = 0xff;
+ u8 tag = KASAN_TAG_KERNEL;
if (kasan_enabled()) {
tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
@@ -1833,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag)
static inline void page_kasan_tag_reset(struct page *page)
{
if (kasan_enabled())
- page_kasan_tag_set(page, 0xff);
+ page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}
#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
@@ -1953,15 +1964,15 @@ static inline bool page_maybe_dma_pinned(struct page *page)
*
* The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq.
*/
-static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
- struct page *page)
+static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
+ struct folio *folio)
{
VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));
if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))
return false;
- return page_maybe_dma_pinned(page);
+ return folio_maybe_dma_pinned(folio);
}
/**
@@ -2373,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
-int generic_error_remove_page(struct address_space *mapping, struct page *page);
+int generic_error_remove_folio(struct address_space *mapping,
+ struct folio *folio);
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
unsigned long address, struct pt_regs *regs);
@@ -3859,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+#else
+static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ return 0;
+}
+
+static inline void vmem_altmap_free(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+}
+#endif
+
#define VMEMMAP_RESERVE_NR 2
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 950df415d7de..4dd996ad0bd3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a);
* @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
* @__page_mapping: Aliases with page->mapping. Unused for page tables.
* @pt_mm: Used for x86 pgds.
- * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
* @_pt_pad_2: Padding to ensure proper alignment.
* @ptl: Lock for the page table.
* @__page_type: Same as page->page_type. Unused for page tables.
- * @_refcount: Same as page refcount. Used for s390 page tables.
+ * @__page_refcount: Same as page refcount.
* @pt_memcg_data: Memcg data. Tracked for page tables here.
*
* This struct overlays struct page for now. Do not modify without a good
@@ -438,7 +438,7 @@ struct ptdesc {
#endif
};
unsigned int __page_type;
- atomic_t _refcount;
+ atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
unsigned long pt_memcg_data;
#endif
@@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(mapping, __page_mapping);
TABLE_MATCH(rcu_head, pt_rcu_head);
TABLE_MATCH(page_type, __page_type);
-TABLE_MATCH(_refcount, _refcount);
+TABLE_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
TABLE_MATCH(memcg_data, pt_memcg_data);
#endif
@@ -961,7 +961,7 @@ struct mm_struct {
*/
unsigned long ksm_zero_pages;
#endif /* CONFIG_KSM */
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
@@ -976,7 +976,7 @@ struct mm_struct {
struct mem_cgroup *memcg;
#endif
} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
} __randomize_layout;
/*
@@ -1014,11 +1014,13 @@ struct lru_gen_mm_list {
spinlock_t lock;
};
+#endif /* CONFIG_LRU_GEN */
+
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
+
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
-#ifdef CONFIG_MEMCG
void lru_gen_migrate_mm(struct mm_struct *mm);
-#endif
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
@@ -1039,7 +1041,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
WRITE_ONCE(mm->lru_gen.bitmap, -1);
}
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_GEN_WALKS_MMU */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
@@ -1049,11 +1051,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm)
{
}
-#ifdef CONFIG_MEMCG
static inline void lru_gen_migrate_mm(struct mm_struct *mm)
{
}
-#endif
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
@@ -1063,7 +1063,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
{
}
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_GEN_WALKS_MMU */
struct vma_iterator {
struct ma_state mas;
@@ -1074,7 +1074,8 @@ struct vma_iterator {
.mas = { \
.tree = &(__mm)->mm_mt, \
.index = __addr, \
- .node = MAS_START, \
+ .node = NULL, \
+ .status = ma_start, \
}, \
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9db36e197712..4ed33b127821 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,18 +22,21 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/zswap.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 10
+#define MAX_PAGE_ORDER 10
#else
-#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
+#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
-#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
+#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
+
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
* costly to service. That is between allocation orders which should
@@ -95,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt)
}
#define for_each_migratetype_order(order, type) \
- for (order = 0; order <= MAX_ORDER; order++) \
+ for (order = 0; order < NR_PAGE_ORDERS; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
extern int page_group_by_mobility_disabled;
@@ -207,6 +210,10 @@ enum node_stat_item {
PGPROMOTE_SUCCESS, /* promote successfully */
PGPROMOTE_CANDIDATE, /* candidate pages to promote */
#endif
+ /* PGDEMOTE_*: pages demoted */
+ PGDEMOTE_KSWAPD,
+ PGDEMOTE_DIRECT,
+ PGDEMOTE_KHUGEPAGED,
NR_VM_NODE_STAT_ITEMS
};
@@ -435,14 +442,12 @@ struct lru_gen_folio {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
-#ifdef CONFIG_MEMCG
/* the memcg generation this lru_gen_folio belongs to */
u8 gen;
/* the list segment this lru_gen_folio belongs to */
u8 seg;
/* per-node lru_gen_folio list for global reclaim */
struct hlist_nulls_node list;
-#endif
};
enum {
@@ -488,11 +493,6 @@ struct lru_gen_mm_walk {
bool force_scan;
};
-void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
-
-#ifdef CONFIG_MEMCG
-
/*
* For each node, memcgs are divided into two generations: the old and the
* young. For each generation, memcgs are randomly sharded into multiple bins
@@ -550,6 +550,8 @@ struct lru_gen_memcg {
};
void lru_gen_init_pgdat(struct pglist_data *pgdat);
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
@@ -558,19 +560,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);
-#else /* !CONFIG_MEMCG */
-
-#define MEMCG_NR_GENS 1
-
-struct lru_gen_memcg {
-};
-
-static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
-{
-}
-
-#endif /* CONFIG_MEMCG */
-
#else /* !CONFIG_LRU_GEN */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
@@ -585,8 +574,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
}
-#ifdef CONFIG_MEMCG
-
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -611,8 +598,6 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}
-#endif /* CONFIG_MEMCG */
-
#endif /* CONFIG_LRU_GEN */
struct lruvec {
@@ -635,12 +620,15 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
struct lru_gen_folio lrugen;
+#ifdef CONFIG_LRU_GEN_WALKS_MMU
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
+#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
+ struct zswap_lruvec_state zswap_lruvec_state;
};
/* Isolate for asynchronous migration */
@@ -947,10 +935,10 @@ struct zone {
CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */
- struct free_area free_area[MAX_ORDER + 1];
+ struct free_area free_area[NR_PAGE_ORDERS];
#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Pages to be accepted. All pages on the list are MAX_ORDER */
+ /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
struct list_head unaccepted_pages;
#endif
@@ -1760,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
#define SECTION_BLOCKFLAGS_BITS \
((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
-#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
@@ -1793,6 +1781,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)
struct mem_section_usage {
+ struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
@@ -1986,7 +1975,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
int idx = subsection_map_index(pfn);
- return test_bit(idx, ms->usage->subsection_map);
+ return test_bit(idx, READ_ONCE(ms->usage)->subsection_map);
}
#else
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
@@ -2010,6 +1999,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
static inline int pfn_valid(unsigned long pfn)
{
struct mem_section *ms;
+ int ret;
/*
* Ensure the upper PAGE_SHIFT bits are clear in the
@@ -2023,13 +2013,19 @@ static inline int pfn_valid(unsigned long pfn)
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
ms = __pfn_to_section(pfn);
- if (!valid_section(ms))
+ rcu_read_lock();
+ if (!valid_section(ms)) {
+ rcu_read_unlock();
return 0;
+ }
/*
* Traditionally early sections always returned pfn_valid() for
* the entire section-sized span.
*/
- return early_section(ms) || pfn_section_valid(ms, pfn);
+ ret = early_section(ms) || pfn_section_valid(ms, pfn);
+ rcu_read_unlock();
+
+ return ret;
}
#endif
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a88e64acebfe..735cddc13d20 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -772,19 +772,14 @@ static __always_inline void SetPageUptodate(struct page *page)
CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)
-bool __folio_start_writeback(struct folio *folio, bool keep_write);
-bool set_page_writeback(struct page *page);
+void __folio_start_writeback(struct folio *folio, bool keep_write);
+void set_page_writeback(struct page *page);
#define folio_start_writeback(folio) \
__folio_start_writeback(folio, false)
#define folio_start_writeback_keepwrite(folio) \
__folio_start_writeback(folio, true)
-static inline bool test_set_page_writeback(struct page *page)
-{
- return set_page_writeback(page);
-}
-
static __always_inline bool folio_test_head(struct folio *folio)
{
return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY));
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e83c4c095041..3f2409b968ec 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
* Huge pages are a constant size, but don't exceed the maximum allocation
* granularity.
*/
-#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
+#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
#else /* CONFIG_HUGETLB_PAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order MAX_ORDER
+#define pageblock_order MAX_PAGE_ORDER
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..466cf477551a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd)
}
#endif
+#ifndef pmd_dirty
+static inline int pmd_dirty(pmd_t pmd)
+{
+ return 0;
+}
+#endif
+
/*
* A facility to provide lazy MMU batching. This allows PTE updates and
* page invalidations to be delayed until a call to leave lazy MMU mode
@@ -375,7 +382,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void)
*/
static inline bool arch_has_hw_pte_young(void)
{
- return false;
+ return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
}
#endif
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b26fe858fd44..b7944a833668 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
down_write(&anon_vma->root->rwsem);
}
+static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
+{
+ return down_write_trylock(&anon_vma->root->rwsem);
+}
+
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
up_write(&anon_vma->root->rwsem);
@@ -172,133 +177,323 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio);
typedef int __bitwise rmap_t;
/*
- * No special request: if the page is a subpage of a compound page, it is
- * mapped via a PTE. The mapped (sub)page is possibly shared between processes.
+ * No special request: A mapped anonymous (sub)page is possibly shared between
+ * processes.
*/
#define RMAP_NONE ((__force rmap_t)0)
-/* The (sub)page is exclusive to a single process. */
+/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0))
/*
- * The compound page is not mapped via PTEs, but instead via a single PMD and
- * should be accounted accordingly.
+ * Internally, we're using an enum to specify the granularity. We make the
+ * compiler emit specialized code for each granularity.
*/
-#define RMAP_COMPOUND ((__force rmap_t)BIT(1))
+enum rmap_level {
+ RMAP_LEVEL_PTE = 0,
+ RMAP_LEVEL_PMD,
+};
+
+static inline void __folio_rmap_sanity_checks(struct folio *folio,
+ struct page *page, int nr_pages, enum rmap_level level)
+{
+ /* hugetlb folios are handled separately. */
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+
+ /*
+ * TODO: we get driver-allocated folios that have nothing to do with
+ * the rmap using vm_insert_page(); therefore, we cannot assume that
+ * folio_test_large_rmappable() holds for large folios. We should
+ * handle any desired mapcount+stats accounting for these folios in
+ * VM_MIXEDMAP VMAs separately, and then sanity-check here that
+ * we really only get rmappable folios.
+ */
+
+ VM_WARN_ON_ONCE(nr_pages <= 0);
+ VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
+ VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
+
+ switch (level) {
+ case RMAP_LEVEL_PTE:
+ break;
+ case RMAP_LEVEL_PMD:
+ /*
+ * We don't support folios larger than a single PMD yet. So
+ * when RMAP_LEVEL_PMD is set, we assume that we are creating
+ * a single "entire" mapping of the folio.
+ */
+ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
+ VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
+ break;
+ default:
+ VM_WARN_ON_ONCE(true);
+ }
+}
/*
* rmap interfaces called when adding or removing pte of page
*/
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
-void page_add_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long address, rmap_t flags);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
- unsigned long address);
+void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
+ struct vm_area_struct *, unsigned long address, rmap_t flags);
+#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
+ folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
+void folio_add_anon_rmap_pmd(struct folio *, struct page *,
+ struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
-void page_add_file_rmap(struct page *, struct vm_area_struct *,
- bool compound);
-void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
- struct vm_area_struct *, bool compound);
-void page_remove_rmap(struct page *, struct vm_area_struct *,
- bool compound);
-
-void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *,
+void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
+ struct vm_area_struct *);
+#define folio_add_file_rmap_pte(folio, page, vma) \
+ folio_add_file_rmap_ptes(folio, page, 1, vma)
+void folio_add_file_rmap_pmd(struct folio *, struct page *,
+ struct vm_area_struct *);
+void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
+ struct vm_area_struct *);
+#define folio_remove_rmap_pte(folio, page, vma) \
+ folio_remove_rmap_ptes(folio, page, 1, vma)
+void folio_remove_rmap_pmd(struct folio *, struct page *,
+ struct vm_area_struct *);
+
+void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
-void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
+void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
-static inline void __page_dup_rmap(struct page *page, bool compound)
+/* See folio_try_dup_anon_rmap_*() */
+static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
+ struct vm_area_struct *vma)
{
- if (compound) {
- struct folio *folio = (struct folio *)page;
+ VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- atomic_inc(&folio->_entire_mapcount);
- } else {
- atomic_inc(&page->_mapcount);
+ if (PageAnonExclusive(&folio->page)) {
+ if (unlikely(folio_needs_cow_for_dma(vma, folio)))
+ return -EBUSY;
+ ClearPageAnonExclusive(&folio->page);
}
+ atomic_inc(&folio->_entire_mapcount);
+ return 0;
+}
+
+/* See folio_try_share_anon_rmap_*() */
+static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
+
+ /* Paired with the memory barrier in try_grab_folio(). */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb();
+
+ if (unlikely(folio_maybe_dma_pinned(folio)))
+ return -EBUSY;
+ ClearPageAnonExclusive(&folio->page);
+
+ /*
+ * This is conceptually a smp_wmb() paired with the smp_rmb() in
+ * gup_must_unshare().
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+ smp_mb__after_atomic();
+ return 0;
+}
+
+static inline void hugetlb_add_file_rmap(struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+ VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
+
+ atomic_inc(&folio->_entire_mapcount);
+}
+
+static inline void hugetlb_remove_rmap(struct folio *folio)
+{
+ VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
+
+ atomic_dec(&folio->_entire_mapcount);
}
-static inline void page_dup_file_rmap(struct page *page, bool compound)
+static __always_inline void __folio_dup_file_rmap(struct folio *folio,
+ struct page *page, int nr_pages, enum rmap_level level)
{
- __page_dup_rmap(page, compound);
+ __folio_rmap_sanity_checks(folio, page, nr_pages, level);
+
+ switch (level) {
+ case RMAP_LEVEL_PTE:
+ do {
+ atomic_inc(&page->_mapcount);
+ } while (page++, --nr_pages > 0);
+ break;
+ case RMAP_LEVEL_PMD:
+ atomic_inc(&folio->_entire_mapcount);
+ break;
+ }
}
/**
- * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped
- * anonymous page
- * @page: the page to duplicate the mapping for
- * @compound: the page is mapped as compound or as a small page
- * @vma: the source vma
+ * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
+ * @folio: The folio to duplicate the mappings of
+ * @page: The first page to duplicate the mappings of
+ * @nr_pages: The number of pages of which the mapping will be duplicated
*
- * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq.
+ * The page range of the folio is defined by [page, page + nr_pages)
*
- * Duplicating the mapping can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
+ * The caller needs to hold the page table lock.
+ */
+static inline void folio_dup_file_rmap_ptes(struct folio *folio,
+ struct page *page, int nr_pages)
+{
+ __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
+}
+#define folio_dup_file_rmap_pte(folio, page) \
+ folio_dup_file_rmap_ptes(folio, page, 1)
+
+/**
+ * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
+ * @folio: The folio to duplicate the mapping of
+ * @page: The first page to duplicate the mapping of
*
- * If duplicating the mapping succeeds, the page has to be mapped R/O into
- * the parent and the child. It must *not* get mapped writable after this call.
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
*
- * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
+ * The caller needs to hold the page table lock.
*/
-static inline int page_try_dup_anon_rmap(struct page *page, bool compound,
- struct vm_area_struct *vma)
+static inline void folio_dup_file_rmap_pmd(struct folio *folio,
+ struct page *page)
{
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
- /*
- * No need to check+clear for already shared pages, including KSM
- * pages.
- */
- if (!PageAnonExclusive(page))
- goto dup;
+static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
+ struct page *page, int nr_pages, struct vm_area_struct *src_vma,
+ enum rmap_level level)
+{
+ bool maybe_pinned;
+ int i;
+
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ __folio_rmap_sanity_checks(folio, page, nr_pages, level);
/*
- * If this page may have been pinned by the parent process,
- * don't allow to duplicate the mapping but instead require to e.g.,
- * copy the page immediately for the child so that we'll always
- * guarantee the pinned page won't be randomly replaced in the
+ * If this folio may have been pinned by the parent process,
+ * don't allow to duplicate the mappings but instead require to e.g.,
+ * copy the subpage immediately for the child so that we'll always
+ * guarantee the pinned folio won't be randomly replaced in the
* future on write faults.
*/
- if (likely(!is_device_private_page(page) &&
- unlikely(page_needs_cow_for_dma(vma, page))))
- return -EBUSY;
+ maybe_pinned = likely(!folio_is_device_private(folio)) &&
+ unlikely(folio_needs_cow_for_dma(src_vma, folio));
- ClearPageAnonExclusive(page);
/*
- * It's okay to share the anon page between both processes, mapping
- * the page R/O into both processes.
+ * No need to check+clear for already shared PTEs/PMDs of the
+ * folio. But if any page is PageAnonExclusive, we must fallback to
+ * copying if the folio maybe pinned.
*/
-dup:
- __page_dup_rmap(page, compound);
+ switch (level) {
+ case RMAP_LEVEL_PTE:
+ if (unlikely(maybe_pinned)) {
+ for (i = 0; i < nr_pages; i++)
+ if (PageAnonExclusive(page + i))
+ return -EBUSY;
+ }
+ do {
+ if (PageAnonExclusive(page))
+ ClearPageAnonExclusive(page);
+ atomic_inc(&page->_mapcount);
+ } while (page++, --nr_pages > 0);
+ break;
+ case RMAP_LEVEL_PMD:
+ if (PageAnonExclusive(page)) {
+ if (unlikely(maybe_pinned))
+ return -EBUSY;
+ ClearPageAnonExclusive(page);
+ }
+ atomic_inc(&folio->_entire_mapcount);
+ break;
+ }
return 0;
}
/**
- * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly
- * shared to prepare for KSM or temporary unmapping
- * @page: the exclusive anonymous page to try marking possibly shared
+ * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
+ * of a folio
+ * @folio: The folio to duplicate the mappings of
+ * @page: The first page to duplicate the mappings of
+ * @nr_pages: The number of pages of which the mapping will be duplicated
+ * @src_vma: The vm area from which the mappings are duplicated
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
*
- * The caller needs to hold the PT lock and has to have the page table entry
- * cleared/invalidated.
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
*
- * This is similar to page_try_dup_anon_rmap(), however, not used during fork()
- * to duplicate a mapping, but instead to prepare for KSM or temporarily
- * unmapping a page (swap, migration) via page_remove_rmap().
+ * Duplicating the mappings can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
*
- * Marking the page shared can only fail if the page may be pinned; device
- * private pages cannot get pinned and consequently this function cannot fail.
+ * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
*
- * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY
- * otherwise.
+ * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
*/
-static inline int page_try_share_anon_rmap(struct page *page)
+static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
+ struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
- VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page);
+ return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
+ RMAP_LEVEL_PTE);
+}
+#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
+ folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
- /* device private pages cannot get pinned via GUP. */
- if (unlikely(is_device_private_page(page))) {
+/**
+ * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
+ * of a folio
+ * @folio: The folio to duplicate the mapping of
+ * @page: The first page to duplicate the mapping of
+ * @src_vma: The vm area from which the mapping is duplicated
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock and the
+ * vma->vma_mm->write_protect_seq.
+ *
+ * Duplicating the mapping can only fail if the folio may be pinned; device
+ * private folios cannot get pinned and consequently this function cannot fail
+ * for them.
+ *
+ * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
+ * the parent and the child. They must *not* be writable after this call
+ * succeeded.
+ *
+ * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
+ */
+static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
+ struct page *page, struct vm_area_struct *src_vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
+ RMAP_LEVEL_PMD);
+#else
+ WARN_ON_ONCE(true);
+ return -EBUSY;
+#endif
+}
+
+static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
+ struct page *page, int nr_pages, enum rmap_level level)
+{
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
+ VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
+ __folio_rmap_sanity_checks(folio, page, nr_pages, level);
+
+ /* device private folios cannot get pinned via GUP. */
+ if (unlikely(folio_is_device_private(folio))) {
ClearPageAnonExclusive(page);
return 0;
}
@@ -349,7 +544,7 @@ static inline int page_try_share_anon_rmap(struct page *page)
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
smp_mb();
- if (unlikely(page_maybe_dma_pinned(page)))
+ if (unlikely(folio_maybe_dma_pinned(folio)))
return -EBUSY;
ClearPageAnonExclusive(page);
@@ -362,6 +557,68 @@ static inline int page_try_share_anon_rmap(struct page *page)
return 0;
}
+/**
+ * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
+ * mapped by a PTE possibly shared to prepare
+ * for KSM or temporary unmapping
+ * @folio: The folio to share a mapping of
+ * @page: The mapped exclusive page
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
+ * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
+ *
+ * Marking the mapped page shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped page possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
+ struct page *page)
+{
+ return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+}
+
+/**
+ * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
+ * range mapped by a PMD possibly shared to
+ * prepare for temporary unmapping
+ * @folio: The folio to share the mapping of
+ * @page: The first page to share the mapping of
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock and has to have the page table
+ * entries cleared/invalidated.
+ *
+ * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
+ * fork() to duplicate a mapping, but instead to prepare for temporarily
+ * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
+ *
+ * Marking the mapped pages shared can only fail if the folio maybe pinned;
+ * device private folios cannot get pinned and consequently this function cannot
+ * fail.
+ *
+ * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
+ * -EBUSY otherwise.
+ */
+static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
+ struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
+ RMAP_LEVEL_PMD);
+#else
+ WARN_ON_ONCE(true);
+ return -EBUSY;
+#endif
+}
+
/*
* Called from mm/vmscan.c to handle paging out
*/
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b2015d0e01ad..b5f5ee8308d0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -307,7 +307,7 @@ static inline unsigned int arch_slab_minalign(void)
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index e58306783d8e..adcbb8f23600 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -11,8 +11,6 @@
* SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free
* stack traces often repeat, using stack depot allows to save about 100x space.
*
- * Stack traces are never removed from the stack depot.
- *
* Author: Alexander Potapenko <glider@google.com>
* Copyright (C) 2016 Google, Inc.
*
@@ -32,6 +30,18 @@ typedef u32 depot_stack_handle_t;
*/
#define STACK_DEPOT_EXTRA_BITS 5
+typedef u32 depot_flags_t;
+
+/*
+ * Flags that can be passed to stack_depot_save_flags(); see the comment next
+ * to its declaration for more details.
+ */
+#define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001)
+#define STACK_DEPOT_FLAG_GET ((depot_flags_t)0x0002)
+
+#define STACK_DEPOT_FLAGS_NUM 2
+#define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1))
+
/*
* Using stack depot requires its initialization, which can be done in 3 ways:
*
@@ -69,31 +79,39 @@ static inline int stack_depot_early_init(void) { return 0; }
#endif
/**
- * __stack_depot_save - Save a stack trace to stack depot
+ * stack_depot_save_flags - Save a stack trace to stack depot
*
* @entries: Pointer to the stack trace
* @nr_entries: Number of frames in the stack
* @alloc_flags: Allocation GFP flags
- * @can_alloc: Allocate stack pools (increased chance of failure if false)
+ * @depot_flags: Stack depot flags
+ *
+ * Saves a stack trace from @entries array of size @nr_entries.
+ *
+ * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can
+ * replenish the stack pools in case no space is left (allocates using GFP
+ * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and
+ * fails if no space is left to store the stack trace.
*
- * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is
- * %true, stack depot can replenish the stack pools in case no space is left
- * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids
- * any allocations and fails if no space is left to store the stack trace.
+ * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment
+ * the refcount on the saved stack trace if it already exists in stack depot.
+ * Users of this flag must also call stack_depot_put() when keeping the stack
+ * trace is no longer required to avoid overflowing the refcount.
*
* If the provided stack trace comes from the interrupt context, only the part
* up to the interrupt entry is saved.
*
- * Context: Any context, but setting @can_alloc to %false is required if
+ * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if
* alloc_pages() cannot be used from the current context. Currently
* this is the case for contexts where neither %GFP_ATOMIC nor
* %GFP_NOWAIT can be used (NMI, raw_spin_lock).
*
* Return: Handle of the stack struct stored in depot, 0 on failure
*/
-depot_stack_handle_t __stack_depot_save(unsigned long *entries,
- unsigned int nr_entries,
- gfp_t gfp_flags, bool can_alloc);
+depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
+ unsigned int nr_entries,
+ gfp_t gfp_flags,
+ depot_flags_t depot_flags);
/**
* stack_depot_save - Save a stack trace to stack depot
@@ -102,8 +120,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries,
* @nr_entries: Number of frames in the stack
* @alloc_flags: Allocation GFP flags
*
- * Context: Contexts where allocations via alloc_pages() are allowed.
- * See __stack_depot_save() for more details.
+ * Does not increment the refcount on the saved stack trace; see
+ * stack_depot_save_flags() for more details.
+ *
+ * Context: Contexts where allocations via alloc_pages() are allowed;
+ * see stack_depot_save_flags() for more details.
*
* Return: Handle of the stack trace stored in depot, 0 on failure
*/
@@ -142,6 +163,18 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
int spaces);
/**
+ * stack_depot_put - Drop a reference to a stack trace from stack depot
+ *
+ * @handle: Stack depot handle returned from stack_depot_save()
+ *
+ * The stack trace is evicted from stack depot once all references to it have
+ * been dropped (once the number of stack_depot_evict() calls matches the
+ * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for
+ * this stack trace).
+ */
+void stack_depot_put(depot_stack_handle_t handle);
+
+/**
* stack_depot_set_extra_bits - Set extra bits in a stack depot handle
*
* @handle: Stack depot handle returned from stack_depot_save()
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f6dd6575b905..4db00ddad261 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma);
-
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -490,13 +487,12 @@ extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
-extern struct swap_info_struct *page_swap_info(struct page *);
-extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
+struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
-sector_t swap_page_sector(struct page *page);
+sector_t swap_folio_sector(struct folio *folio);
static inline void put_swap_device(struct swap_info_struct *si)
{
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2dc19f40d05..e4056547fbe6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
+/* move_pages */
+void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
+void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
+ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
+ unsigned long dst_start, unsigned long src_start,
+ unsigned long len, __u64 flags);
+int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr);
+
/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx vm_ctx)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 8abfa1240040..747943bc8cc2 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
- PGDEMOTE_KSWAPD,
- PGDEMOTE_DIRECT,
- PGDEMOTE_KHUGEPAGED,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -145,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_ZSWAP
ZSWPIN,
ZSWPOUT,
+ ZSWPWB,
#endif
#ifdef CONFIG_X86
DIRECT_MAP_LEVEL2_SPLIT,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fed855bae6d8..343906a98d6e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
local_irq_restore(flags);
}
-void __mod_lruvec_page_state(struct page *page,
+void __lruvec_stat_mod_folio(struct folio *folio,
enum node_stat_item idx, int val);
-static inline void mod_lruvec_page_state(struct page *page,
+static inline void lruvec_stat_mod_folio(struct folio *folio,
enum node_stat_item idx, int val)
{
unsigned long flags;
local_irq_save(flags);
- __mod_lruvec_page_state(page, idx, val);
+ __lruvec_stat_mod_folio(folio, idx, val);
local_irq_restore(flags);
}
+static inline void mod_lruvec_page_state(struct page *page,
+ enum node_stat_item idx, int val)
+{
+ lruvec_stat_mod_folio(page_folio(page), idx, val);
+}
+
#else
static inline void __mod_lruvec_state(struct lruvec *lruvec,
@@ -583,37 +589,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}
-static inline void __mod_lruvec_page_state(struct page *page,
- enum node_stat_item idx, int val)
-{
- __mod_node_page_state(page_pgdat(page), idx, val);
-}
-
-static inline void mod_lruvec_page_state(struct page *page,
+static inline void __lruvec_stat_mod_folio(struct folio *folio,
enum node_stat_item idx, int val)
{
- mod_node_page_state(page_pgdat(page), idx, val);
+ __mod_node_page_state(folio_pgdat(folio), idx, val);
}
-#endif /* CONFIG_MEMCG */
-
-static inline void __inc_lruvec_page_state(struct page *page,
- enum node_stat_item idx)
+static inline void lruvec_stat_mod_folio(struct folio *folio,
+ enum node_stat_item idx, int val)
{
- __mod_lruvec_page_state(page, idx, 1);
+ mod_node_page_state(folio_pgdat(folio), idx, val);
}
-static inline void __dec_lruvec_page_state(struct page *page,
- enum node_stat_item idx)
+static inline void mod_lruvec_page_state(struct page *page,
+ enum node_stat_item idx, int val)
{
- __mod_lruvec_page_state(page, idx, -1);
+ mod_node_page_state(page_pgdat(page), idx, val);
}
-static inline void __lruvec_stat_mod_folio(struct folio *folio,
- enum node_stat_item idx, int val)
-{
- __mod_lruvec_page_state(&folio->page, idx, val);
-}
+#endif /* CONFIG_MEMCG */
static inline void __lruvec_stat_add_folio(struct folio *folio,
enum node_stat_item idx)
@@ -627,24 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio,
__lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio));
}
-static inline void inc_lruvec_page_state(struct page *page,
- enum node_stat_item idx)
-{
- mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void dec_lruvec_page_state(struct page *page,
- enum node_stat_item idx)
-{
- mod_lruvec_page_state(page, idx, -1);
-}
-
-static inline void lruvec_stat_mod_folio(struct folio *folio,
- enum node_stat_item idx, int val)
-{
- mod_lruvec_page_state(&folio->page, idx, val);
-}
-
static inline void lruvec_stat_add_folio(struct folio *folio,
enum node_stat_item idx)
{
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..0b709f5bc65f 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -5,19 +5,41 @@
#include <linux/types.h>
#include <linux/mm_types.h>
+struct lruvec;
+
extern u64 zswap_pool_total_size;
extern atomic_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP
+struct zswap_lruvec_state {
+ /*
+ * Number of pages in zswap that should be protected from the shrinker.
+ * This number is an estimate of the following counts:
+ *
+ * a) Recent page faults.
+ * b) Recent insertion to the zswap LRU. This includes new zswap stores,
+ * as well as recent zswap LRU rotations.
+ *
+ * These pages are likely to be warm, and might incur IO if the are written
+ * to swap.
+ */
+ atomic_long_t nr_zswap_protected;
+};
+
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(int type, pgoff_t offset);
void zswap_swapon(int type);
void zswap_swapoff(int type);
-
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
+void zswap_lruvec_state_init(struct lruvec *lruvec);
+void zswap_folio_swapin(struct folio *folio);
+bool is_zswap_enabled(void);
#else
+struct zswap_lruvec_state {};
+
static inline bool zswap_store(struct folio *folio)
{
return false;
@@ -31,6 +53,14 @@ static inline bool zswap_load(struct folio *folio)
static inline void zswap_invalidate(int type, pgoff_t offset) {}
static inline void zswap_swapon(int type) {}
static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
+static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
+static inline void zswap_folio_swapin(struct folio *folio) {}
+
+static inline bool is_zswap_enabled(void)
+{
+ return false;
+}
#endif
diff --git a/include/trace/events/ksm.h b/include/trace/events/ksm.h
index b5ac35c1d0e8..e728647b5d26 100644
--- a/include/trace/events/ksm.h
+++ b/include/trace/events/ksm.h
@@ -245,6 +245,39 @@ TRACE_EVENT(ksm_remove_rmap_item,
__entry->pfn, __entry->rmap_item, __entry->mm)
);
+/**
+ * ksm_advisor - called after the advisor has run
+ *
+ * @scan_time: scan time in seconds
+ * @pages_to_scan: new pages_to_scan value
+ * @cpu_percent: cpu usage in percent
+ *
+ * Allows to trace the ksm advisor.
+ */
+TRACE_EVENT(ksm_advisor,
+
+ TP_PROTO(s64 scan_time, unsigned long pages_to_scan,
+ unsigned int cpu_percent),
+
+ TP_ARGS(scan_time, pages_to_scan, cpu_percent),
+
+ TP_STRUCT__entry(
+ __field(s64, scan_time)
+ __field(unsigned long, pages_to_scan)
+ __field(unsigned int, cpu_percent)
+ ),
+
+ TP_fast_assign(
+ __entry->scan_time = scan_time;
+ __entry->pages_to_scan = pages_to_scan;
+ __entry->cpu_percent = cpu_percent;
+ ),
+
+ TP_printk("ksm scan time %lld pages_to_scan %lu cpu percent %u",
+ __entry->scan_time, __entry->pages_to_scan,
+ __entry->cpu_percent)
+);
+
#endif /* _TRACE_KSM_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index da43810b7485..48ad69f7722e 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -316,6 +316,7 @@ typedef int __bitwise __kernel_rwf_t;
#define PAGE_IS_SWAPPED (1 << 4)
#define PAGE_IS_PFNZERO (1 << 5)
#define PAGE_IS_HUGE (1 << 6)
+#define PAGE_IS_SOFT_DIRTY (1 << 7)
/*
* struct page_region - Page region with flags
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 0dbc81015018..2841e4ea8f2c 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -41,7 +41,8 @@
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
UFFD_FEATURE_WP_UNPOPULATED | \
UFFD_FEATURE_POISON | \
- UFFD_FEATURE_WP_ASYNC)
+ UFFD_FEATURE_WP_ASYNC | \
+ UFFD_FEATURE_MOVE)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -50,6 +51,7 @@
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE | \
+ (__u64)1 << _UFFDIO_MOVE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
(__u64)1 << _UFFDIO_POISON)
@@ -73,6 +75,7 @@
#define _UFFDIO_WAKE (0x02)
#define _UFFDIO_COPY (0x03)
#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_MOVE (0x05)
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_POISON (0x08)
@@ -92,6 +95,8 @@
struct uffdio_copy)
#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
struct uffdio_zeropage)
+#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \
+ struct uffdio_move)
#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
struct uffdio_writeprotect)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
@@ -222,6 +227,9 @@ struct uffdio_api {
* asynchronous mode is supported in which the write fault is
* automatically resolved and write-protection is un-set.
* It implies UFFD_FEATURE_WP_UNPOPULATED.
+ *
+ * UFFD_FEATURE_MOVE indicates that the kernel supports moving an
+ * existing page contents from userspace.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -239,6 +247,7 @@ struct uffdio_api {
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
#define UFFD_FEATURE_POISON (1<<14)
#define UFFD_FEATURE_WP_ASYNC (1<<15)
+#define UFFD_FEATURE_MOVE (1<<16)
__u64 features;
__u64 ioctls;
@@ -347,6 +356,24 @@ struct uffdio_poison {
__s64 updated;
};
+struct uffdio_move {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * Especially if used to atomically remove memory from the
+ * address space the wake on the dst range is not needed.
+ */
+#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0)
+#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1)
+ __u64 mode;
+ /*
+ * "move" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 move;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/