summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/cacheflush.h7
-rw-r--r--include/asm-generic/io.h31
-rw-r--r--include/asm-generic/iomap.h6
-rw-r--r--include/asm-generic/pgalloc.h88
-rw-r--r--include/asm-generic/tlb.h12
-rw-r--r--include/linux/backing-dev.h1
-rw-r--r--include/linux/bio.h5
-rw-r--r--include/linux/buffer_head.h4
-rw-r--r--include/linux/cacheflush.h13
-rw-r--r--include/linux/damon.h28
-rw-r--r--include/linux/dax.h4
-rw-r--r--include/linux/frontswap.h91
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/highmem.h44
-rw-r--r--include/linux/huge_mm.h6
-rw-r--r--include/linux/hugetlb.h38
-rw-r--r--include/linux/ioremap.h30
-rw-r--r--include/linux/kfence.h11
-rw-r--r--include/linux/ksm.h20
-rw-r--r--include/linux/maple_tree.h46
-rw-r--r--include/linux/memblock.h14
-rw-r--r--include/linux/memcontrol.h18
-rw-r--r--include/linux/memory-tiers.h4
-rw-r--r--include/linux/memory.h8
-rw-r--r--include/linux/memory_hotplug.h3
-rw-r--r--include/linux/minmax.h27
-rw-r--r--include/linux/mm.h365
-rw-r--r--include/linux/mm_inline.h21
-rw-r--r--include/linux/mm_types.h135
-rw-r--r--include/linux/mm_types_task.h4
-rw-r--r--include/linux/mmap_lock.h18
-rw-r--r--include/linux/mmu_notifier.h104
-rw-r--r--include/linux/mmzone.h1
-rw-r--r--include/linux/net_mm.h17
-rw-r--r--include/linux/page-flags.h90
-rw-r--r--include/linux/page_ext.h9
-rw-r--r--include/linux/page_idle.h5
-rw-r--r--include/linux/page_table_check.h71
-rw-r--r--include/linux/pagemap.h60
-rw-r--r--include/linux/pgtable.h123
-rw-r--r--include/linux/pid_namespace.h39
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/secretmem.h15
-rw-r--r--include/linux/swap.h21
-rw-r--r--include/linux/swapfile.h5
-rw-r--r--include/linux/swapops.h15
-rw-r--r--include/linux/userfaultfd_k.h4
-rw-r--r--include/linux/zswap.h37
-rw-r--r--include/net/tcp.h1
-rw-r--r--include/trace/events/thp.h33
-rw-r--r--include/uapi/linux/userfaultfd.h25
51 files changed, 1064 insertions, 717 deletions
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index f46258d1a080..84ec53ccc450 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -77,13 +77,6 @@ static inline void flush_icache_range(unsigned long start, unsigned long end)
#define flush_icache_user_range flush_icache_range
#endif
-#ifndef flush_icache_page
-static inline void flush_icache_page(struct vm_area_struct *vma,
- struct page *page)
-{
-}
-#endif
-
#ifndef flush_icache_user_page
static inline void flush_icache_user_page(struct vm_area_struct *vma,
struct page *page,
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 587e7e9b9a37..bac63e874c7b 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -1047,41 +1047,22 @@ static inline void iounmap(volatile void __iomem *addr)
#elif defined(CONFIG_GENERIC_IOREMAP)
#include <linux/pgtable.h>
-/*
- * Arch code can implement the following two hooks when using GENERIC_IOREMAP
- * ioremap_allowed() return a bool,
- * - true means continue to remap
- * - false means skip remap and return directly
- * iounmap_allowed() return a bool,
- * - true means continue to vunmap
- * - false means skip vunmap and return directly
- */
-#ifndef ioremap_allowed
-#define ioremap_allowed ioremap_allowed
-static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size,
- unsigned long prot)
-{
- return true;
-}
-#endif
-
-#ifndef iounmap_allowed
-#define iounmap_allowed iounmap_allowed
-static inline bool iounmap_allowed(void *addr)
-{
- return true;
-}
-#endif
+void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size,
+ pgprot_t prot);
void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
unsigned long prot);
void iounmap(volatile void __iomem *addr);
+void generic_iounmap(volatile void __iomem *addr);
+#ifndef ioremap
+#define ioremap ioremap
static inline void __iomem *ioremap(phys_addr_t addr, size_t size)
{
/* _PAGE_IOREMAP needs to be supplied by the architecture */
return ioremap_prot(addr, size, _PAGE_IOREMAP);
}
+#endif
#endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */
#ifndef ioremap_wc
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 08237ae8b840..196087a8126e 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -93,15 +93,15 @@ extern void __iomem *ioport_map(unsigned long port, unsigned int nr);
extern void ioport_unmap(void __iomem *);
#endif
-#ifndef ARCH_HAS_IOREMAP_WC
+#ifndef ioremap_wc
#define ioremap_wc ioremap
#endif
-#ifndef ARCH_HAS_IOREMAP_WT
+#ifndef ioremap_wt
#define ioremap_wt ioremap
#endif
-#ifndef ARCH_HAS_IOREMAP_NP
+#ifndef ioremap_np
/* See the comment in asm-generic/io.h about ioremap_np(). */
#define ioremap_np ioremap_np
static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size)
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index a7cf825befae..c75d4a753849 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -8,7 +8,7 @@
#define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
/**
- * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
* @mm: the mm_struct of the current context
*
* This function is intended for architectures that need
@@ -18,12 +18,17 @@
*/
static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
{
- return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL &
+ ~__GFP_HIGHMEM, 0);
+
+ if (!ptdesc)
+ return NULL;
+ return ptdesc_address(ptdesc);
}
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
- * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table
+ * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
* @mm: the mm_struct of the current context
*
* Return: pointer to the allocated memory or %NULL on error
@@ -35,40 +40,40 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
#endif
/**
- * pte_free_kernel - free PTE-level kernel page table page
+ * pte_free_kernel - free PTE-level kernel page table memory
* @mm: the mm_struct of the current context
* @pte: pointer to the memory containing the page table
*/
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- free_page((unsigned long)pte);
+ pagetable_free(virt_to_ptdesc(pte));
}
/**
- * __pte_alloc_one - allocate a page for PTE-level user page table
+ * __pte_alloc_one - allocate memory for a PTE-level user page table
* @mm: the mm_struct of the current context
* @gfp: GFP flags to use for the allocation
*
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
*
* This function is intended for architectures that need
* anything beyond simple page allocation or must have custom GFP flags.
*
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
*/
static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
{
- struct page *pte;
+ struct ptdesc *ptdesc;
- pte = alloc_page(gfp);
- if (!pte)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(pte)) {
- __free_page(pte);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- return pte;
+ return ptdesc_page(ptdesc);
}
#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
@@ -76,9 +81,9 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
* pte_alloc_one - allocate a page for PTE-level user page table
* @mm: the mm_struct of the current context
*
- * Allocates a page and runs the pgtable_pte_page_ctor().
+ * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
*
- * Return: `struct page` initialized as page table or %NULL on error
+ * Return: `struct page` referencing the ptdesc or %NULL on error
*/
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
@@ -92,14 +97,16 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
*/
/**
- * pte_free - free PTE-level user page table page
+ * pte_free - free PTE-level user page table memory
* @mm: the mm_struct of the current context
- * @pte_page: the `struct page` representing the page table
+ * @pte_page: the `struct page` referencing the ptdesc
*/
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
- pgtable_pte_page_dtor(pte_page);
- __free_page(pte_page);
+ struct ptdesc *ptdesc = page_ptdesc(pte_page);
+
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
@@ -107,10 +114,11 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
- * pmd_alloc_one - allocate a page for PMD-level page table
+ * pmd_alloc_one - allocate memory for a PMD-level page table
* @mm: the mm_struct of the current context
*
- * Allocates a page and runs the pgtable_pmd_page_ctor().
+ * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor().
+ *
* Allocations use %GFP_PGTABLE_USER in user context and
* %GFP_PGTABLE_KERNEL in kernel context.
*
@@ -118,28 +126,30 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
*/
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
- struct page *page;
+ struct ptdesc *ptdesc;
gfp_t gfp = GFP_PGTABLE_USER;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- page = alloc_page(gfp);
- if (!page)
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pmd_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- return (pmd_t *)page_address(page);
+ return ptdesc_address(ptdesc);
}
#endif
#ifndef __HAVE_ARCH_PMD_FREE
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- pgtable_pmd_page_dtor(virt_to_page(pmd));
- free_page((unsigned long)pmd);
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
}
#endif
@@ -150,19 +160,25 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_PGTABLE_USER;
+ struct ptdesc *ptdesc;
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
- return (pud_t *)get_zeroed_page(gfp);
+ gfp &= ~__GFP_HIGHMEM;
+
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ return ptdesc_address(ptdesc);
}
#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
- * pud_alloc_one - allocate a page for PUD-level page table
+ * pud_alloc_one - allocate memory for a PUD-level page table
* @mm: the mm_struct of the current context
*
- * Allocates a page using %GFP_PGTABLE_USER for user context and
- * %GFP_PGTABLE_KERNEL for kernel context.
+ * Allocate memory for a page table using %GFP_PGTABLE_USER for user context
+ * and %GFP_PGTABLE_KERNEL for kernel context.
*
* Return: pointer to the allocated memory or %NULL on error
*/
@@ -175,7 +191,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
+ pagetable_free(virt_to_ptdesc(pud));
}
#ifndef __HAVE_ARCH_PUD_FREE
@@ -190,7 +206,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
#ifndef __HAVE_ARCH_PGD_FREE
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ pagetable_free(virt_to_ptdesc(pgd));
}
#endif
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..129a3a759976 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -456,7 +456,6 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
return;
tlb_flush(tlb);
- mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
__tlb_reset_range(tlb);
}
@@ -481,6 +480,17 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return tlb_remove_page_size(tlb, page, PAGE_SIZE);
}
+static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt)
+{
+ tlb_remove_table(tlb, pt);
+}
+
+/* Like tlb_remove_ptdesc, but for page-like page directories. */
+static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
+{
+ tlb_remove_page(tlb, ptdesc_page(pt));
+}
+
static inline void tlb_change_page_size(struct mmu_gather *tlb,
unsigned int page_size)
{
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fbad4fcd408e..1a97277f99b1 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -46,7 +46,6 @@ extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
extern struct workqueue_struct *bdi_wq;
-extern struct workqueue_struct *bdi_async_bio_wq;
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 11984ed29cb8..e8767f165bad 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -253,6 +253,11 @@ static inline struct page *bio_first_page_all(struct bio *bio)
return bio_first_bvec_all(bio)->bv_page;
}
+static inline struct folio *bio_first_folio_all(struct bio *bio)
+{
+ return page_folio(bio_first_page_all(bio));
+}
+
static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
{
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 6cb3e9af78c9..06566aee94ca 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -194,8 +194,6 @@ void buffer_check_dirty_writeback(struct folio *folio,
void mark_buffer_dirty(struct buffer_head *bh);
void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
-void set_bh_page(struct buffer_head *bh,
- struct page *page, unsigned long offset);
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset);
bool try_to_free_buffers(struct folio *);
@@ -288,7 +286,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
unsigned, struct page **, void **,
get_block_t *, loff_t *);
int generic_cont_expand_simple(struct inode *inode, loff_t size);
-int block_commit_write(struct page *page, unsigned from, unsigned to);
+void block_commit_write(struct page *page, unsigned int from, unsigned int to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block);
/* Convert errno to return value from ->page_mkwrite() call */
diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h
index a6189d21f2ba..55f297b2c23f 100644
--- a/include/linux/cacheflush.h
+++ b/include/linux/cacheflush.h
@@ -7,14 +7,23 @@
struct folio;
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+#ifndef flush_dcache_folio
void flush_dcache_folio(struct folio *folio);
#endif
#else
static inline void flush_dcache_folio(struct folio *folio)
{
}
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0
+#define flush_dcache_folio flush_dcache_folio
#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
+#ifndef flush_icache_pages
+static inline void flush_icache_pages(struct vm_area_struct *vma,
+ struct page *page, unsigned int nr)
+{
+}
+#endif
+
+#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1)
+
#endif /* _LINUX_CACHEFLUSH_H */
diff --git a/include/linux/damon.h b/include/linux/damon.h
index d5d4d19928e0..ae2664d1d5f1 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -226,16 +226,26 @@ struct damos_stat {
* enum damos_filter_type - Type of memory for &struct damos_filter
* @DAMOS_FILTER_TYPE_ANON: Anonymous pages.
* @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages.
+ * @DAMOS_FILTER_TYPE_ADDR: Address range.
+ * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target.
* @NR_DAMOS_FILTER_TYPES: Number of filter types.
*
- * The support of each filter type is up to running &struct damon_operations.
- * &enum DAMON_OPS_PADDR is supporting all filter types, while
- * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any
- * filter types.
+ * The anon pages type and memcg type filters are handled by underlying
+ * &struct damon_operations as a part of scheme action trying, and therefore
+ * accounted as 'tried'. In contrast, other types are handled by core layer
+ * before trying of the action and therefore not accounted as 'tried'.
+ *
+ * The support of the filters that handled by &struct damon_operations depend
+ * on the running &struct damon_operations.
+ * &enum DAMON_OPS_PADDR supports both anon pages type and memcg type filters,
+ * while &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR don't support any of
+ * the two types.
*/
enum damos_filter_type {
DAMOS_FILTER_TYPE_ANON,
DAMOS_FILTER_TYPE_MEMCG,
+ DAMOS_FILTER_TYPE_ADDR,
+ DAMOS_FILTER_TYPE_TARGET,
NR_DAMOS_FILTER_TYPES,
};
@@ -244,18 +254,24 @@ enum damos_filter_type {
* @type: Type of the page.
* @matching: If the matching page should filtered out or in.
* @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG.
+ * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR.
+ * @target_idx: Index of the &struct damon_target of
+ * &damon_ctx->adaptive_targets if @type is
+ * DAMOS_FILTER_TYPE_TARGET.
* @list: List head for siblings.
*
* Before applying the &damos->action to a memory region, DAMOS checks if each
* page of the region matches to this and avoid applying the action if so.
- * Note that the check support is up to &struct damon_operations
- * implementation.
+ * Support of each filter type depends on the running &struct damon_operations
+ * and the type. Refer to &enum damos_filter_type for more detai.
*/
struct damos_filter {
enum damos_filter_type type;
bool matching;
union {
unsigned short memcg_id;
+ struct damon_addr_range addr_range;
+ int target_idx;
};
struct list_head list;
};
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 261944ec0887..22cd9902345d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -241,10 +241,10 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
-vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size, pfn_t pfn);
+ unsigned int order, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
deleted file mode 100644
index eaa0ac5f9003..000000000000
--- a/include/linux/frontswap.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FRONTSWAP_H
-#define _LINUX_FRONTSWAP_H
-
-#include <linux/swap.h>
-#include <linux/mm.h>
-#include <linux/bitops.h>
-#include <linux/jump_label.h>
-
-struct frontswap_ops {
- void (*init)(unsigned); /* this swap type was just swapon'ed */
- int (*store)(unsigned, pgoff_t, struct page *); /* store a page */
- int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */
- void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */
- void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */
-};
-
-int frontswap_register_ops(const struct frontswap_ops *ops);
-
-extern void frontswap_init(unsigned type, unsigned long *map);
-extern int __frontswap_store(struct page *page);
-extern int __frontswap_load(struct page *page);
-extern void __frontswap_invalidate_page(unsigned, pgoff_t);
-extern void __frontswap_invalidate_area(unsigned);
-
-#ifdef CONFIG_FRONTSWAP
-extern struct static_key_false frontswap_enabled_key;
-
-static inline bool frontswap_enabled(void)
-{
- return static_branch_unlikely(&frontswap_enabled_key);
-}
-
-static inline void frontswap_map_set(struct swap_info_struct *p,
- unsigned long *map)
-{
- p->frontswap_map = map;
-}
-
-static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
-{
- return p->frontswap_map;
-}
-#else
-/* all inline routines become no-ops and all externs are ignored */
-
-static inline bool frontswap_enabled(void)
-{
- return false;
-}
-
-static inline void frontswap_map_set(struct swap_info_struct *p,
- unsigned long *map)
-{
-}
-
-static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
-{
- return NULL;
-}
-#endif
-
-static inline int frontswap_store(struct page *page)
-{
- if (frontswap_enabled())
- return __frontswap_store(page);
-
- return -1;
-}
-
-static inline int frontswap_load(struct page *page)
-{
- if (frontswap_enabled())
- return __frontswap_load(page);
-
- return -1;
-}
-
-static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
-{
- if (frontswap_enabled())
- __frontswap_invalidate_page(type, offset);
-}
-
-static inline void frontswap_invalidate_area(unsigned type)
-{
- if (frontswap_enabled())
- __frontswap_invalidate_area(type);
-}
-
-#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dda08d973639..c8ff4156a0a1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -478,11 +478,11 @@ struct address_space {
atomic_t nr_thps;
#endif
struct rb_root_cached i_mmap;
- struct rw_semaphore i_mmap_rwsem;
unsigned long nrpages;
pgoff_t writeback_index;
const struct address_space_operations *a_ops;
unsigned long flags;
+ struct rw_semaphore i_mmap_rwsem;
errseq_t wb_err;
spinlock_t private_lock;
struct list_head private_list;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 68da30625a6c..99c474de800d 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -439,6 +439,50 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
kunmap_local(addr);
}
+static inline void memcpy_from_folio(char *to, struct folio *folio,
+ size_t offset, size_t len)
+{
+ VM_BUG_ON(offset + len > folio_size(folio));
+
+ do {
+ const char *from = kmap_local_folio(folio, offset);
+ size_t chunk = len;
+
+ if (folio_test_highmem(folio) &&
+ chunk > PAGE_SIZE - offset_in_page(offset))
+ chunk = PAGE_SIZE - offset_in_page(offset);
+ memcpy(to, from, chunk);
+ kunmap_local(from);
+
+ from += chunk;
+ offset += chunk;
+ len -= chunk;
+ } while (len > 0);
+}
+
+static inline void memcpy_to_folio(struct folio *folio, size_t offset,
+ const char *from, size_t len)
+{
+ VM_BUG_ON(offset + len > folio_size(folio));
+
+ do {
+ char *to = kmap_local_folio(folio, offset);
+ size_t chunk = len;
+
+ if (folio_test_highmem(folio) &&
+ chunk > PAGE_SIZE - offset_in_page(offset))
+ chunk = PAGE_SIZE - offset_in_page(offset);
+ memcpy(to, from, chunk);
+ kunmap_local(to);
+
+ from += chunk;
+ offset += chunk;
+ len -= chunk;
+ } while (len > 0);
+
+ flush_dcache_folio(folio);
+}
+
/**
* memcpy_from_file_folio - Copy some bytes from a file folio.
* @to: The destination buffer.
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e718dbe928ba..fa0350b0812a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -140,9 +140,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
-void prep_transhuge_page(struct page *page);
-void free_transhuge_page(struct page *page);
-
+void folio_prep_large_rmappable(struct folio *folio);
bool can_split_folio(struct folio *folio, int *pextra_pins);
int split_huge_page_to_list(struct page *page, struct list_head *list);
static inline int split_huge_page(struct page *page)
@@ -282,7 +280,7 @@ static inline bool hugepage_vma_check(struct vm_area_struct *vma,
return false;
}
-static inline void prep_transhuge_page(struct page *page) {}
+static inline void folio_prep_large_rmappable(struct folio *folio) {}
#define transparent_hugepage_flags 0UL
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ca3c8e10f24a..5b2626063f4f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -26,6 +26,8 @@ typedef struct { unsigned long pd; } hugepd_t;
#define __hugepd(x) ((hugepd_t) { (x) })
#endif
+void free_huge_folio(struct folio *folio);
+
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/mempolicy.h>
@@ -131,10 +133,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
struct vm_area_struct *, struct vm_area_struct *);
struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
- struct page **, unsigned long *, unsigned long *,
- long, unsigned int, int *);
+ unsigned long address, unsigned int flags,
+ unsigned int *page_mask);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
@@ -167,7 +167,6 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void folio_putback_active_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
-void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
@@ -297,21 +296,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
{
}
-static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags)
+static inline struct page *hugetlb_follow_page_mask(
+ struct vm_area_struct *vma, unsigned long address, unsigned int flags,
+ unsigned int *page_mask)
{
BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
}
-static inline long follow_hugetlb_page(struct mm_struct *mm,
- struct vm_area_struct *vma, struct page **pages,
- unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
-{
- BUG();
- return 0;
-}
-
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
struct mm_struct *src,
struct vm_area_struct *dst_vma,
@@ -851,11 +842,6 @@ static inline struct hstate *folio_hstate(struct folio *folio)
return size_to_hstate(folio_size(folio));
}
-static inline struct hstate *page_hstate(struct page *page)
-{
- return folio_hstate(page_folio(page));
-}
-
static inline unsigned hstate_index_to_shift(unsigned index)
{
return hstates[index].order + PAGE_SHIFT;
@@ -1007,6 +993,11 @@ void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif
+/*
+ * Check if a given raw @page in a hugepage is HWPOISON.
+ */
+bool is_raw_hwpoison_page_in_hugepage(struct page *page);
+
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
@@ -1067,11 +1058,6 @@ static inline struct hstate *folio_hstate(struct folio *folio)
return NULL;
}
-static inline struct hstate *page_hstate(struct page *page)
-{
- return NULL;
-}
-
static inline struct hstate *size_to_hstate(unsigned long size)
{
return NULL;
diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h
new file mode 100644
index 000000000000..f0e99fc7dd8b
--- /dev/null
+++ b/include/linux/ioremap.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IOREMAP_H
+#define _LINUX_IOREMAP_H
+
+#include <linux/kasan.h>
+#include <asm/pgtable.h>
+
+#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP)
+/*
+ * Ioremap often, but not always uses the generic vmalloc area. E.g on
+ * Power ARCH, it could have different ioremap space.
+ */
+#ifndef IOREMAP_START
+#define IOREMAP_START VMALLOC_START
+#define IOREMAP_END VMALLOC_END
+#endif
+static inline bool is_ioremap_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
+
+ return addr >= IOREMAP_START && addr < IOREMAP_END;
+}
+#else
+static inline bool is_ioremap_addr(const void *x)
+{
+ return false;
+}
+#endif
+
+#endif /* _LINUX_IOREMAP_H */
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 726857a4b680..401af4757514 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr)
}
/**
- * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE
+ * metadata via memblock
*/
-void __init kfence_alloc_pool(void);
+void __init kfence_alloc_pool_and_metadata(void);
/**
* kfence_init() - perform KFENCE initialization at boot time
*
- * Requires that kfence_alloc_pool() was called before. This sets up the
- * allocation gate timer, and requires that workqueues are available.
+ * Requires that kfence_alloc_pool_and_metadata() was called before. This sets
+ * up the allocation gate timer, and requires that workqueues are available.
*/
void __init kfence_init(void);
@@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
#else /* CONFIG_KFENCE */
static inline bool is_kfence_address(const void *addr) { return false; }
-static inline void kfence_alloc_pool(void) { }
+static inline void kfence_alloc_pool_and_metadata(void) { }
static inline void kfence_init(void) { }
static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 899a314bc487..c2dd786a30e1 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm);
int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
+/*
+ * To identify zeropages that were mapped by KSM, we reuse the dirty bit
+ * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
+ * deduplicating memory.
+ */
+#define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))
+
+extern unsigned long ksm_zero_pages;
+
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+ if (is_ksm_zero_pte(pte)) {
+ ksm_zero_pages--;
+ mm->ksm_zero_pages--;
+ }
+}
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
@@ -95,6 +111,10 @@ static inline void ksm_exit(struct mm_struct *mm)
{
}
+static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
+{
+}
+
#ifdef CONFIG_MEMORY_FAILURE
static inline void collect_procs_ksm(struct page *page,
struct list_head *to_kill, int force_early)
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 295548cca8b3..e41c70ac7744 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -29,14 +29,12 @@
#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */
-#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */
#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1)
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */
-#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */
#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2)
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */
@@ -184,13 +182,23 @@ enum maple_type {
#ifdef CONFIG_LOCKDEP
typedef struct lockdep_map *lockdep_map_p;
-#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock)
+#define mt_lock_is_held(mt) \
+ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))
+
+#define mt_write_lock_is_held(mt) \
+ (!(mt)->ma_external_lock || \
+ lock_is_held_type((mt)->ma_external_lock, 0))
+
#define mt_set_external_lock(mt, lock) \
(mt)->ma_external_lock = &(lock)->dep_map
+
+#define mt_on_stack(mt) (mt).ma_external_lock = NULL
#else
typedef struct { /* nothing */ } lockdep_map_p;
-#define mt_lock_is_held(mt) 1
+#define mt_lock_is_held(mt) 1
+#define mt_write_lock_is_held(mt) 1
#define mt_set_external_lock(mt, lock) do { } while (0)
+#define mt_on_stack(mt) do { } while (0)
#endif
/*
@@ -212,8 +220,8 @@ struct maple_tree {
spinlock_t ma_lock;
lockdep_map_p ma_external_lock;
};
- void __rcu *ma_root;
unsigned int ma_flags;
+ void __rcu *ma_root;
};
/**
@@ -458,7 +466,7 @@ void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
void *mas_find_range_rev(struct ma_state *mas, unsigned long max);
-int mas_preallocate(struct ma_state *mas, gfp_t gfp);
+int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
bool mas_is_err(struct ma_state *mas);
bool mas_nomem(struct ma_state *mas, gfp_t gfp);
@@ -531,6 +539,22 @@ static inline void mas_reset(struct ma_state *mas)
*/
#define mas_for_each(__mas, __entry, __max) \
while (((__entry) = mas_find((__mas), (__max))) != NULL)
+/**
+ * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the
+ * current location.
+ * @mas: Maple Tree operation state.
+ * @start: New start of range in the Maple Tree.
+ * @last: New end of range in the Maple Tree.
+ *
+ * set the internal maple state values to a sub-range.
+ * Please use mas_set_range() if you do not know where you are in the tree.
+ */
+static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
+ unsigned long last)
+{
+ mas->index = start;
+ mas->last = last;
+}
/**
* mas_set_range() - Set up Maple Tree operation state for a different index.
@@ -545,9 +569,8 @@ static inline void mas_reset(struct ma_state *mas)
static inline
void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last)
{
- mas->index = start;
- mas->last = last;
- mas->node = MAS_START;
+ __mas_set_range(mas, start, last);
+ mas->node = MAS_START;
}
/**
@@ -662,10 +685,11 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);
* mt_for_each - Iterate over each entry starting at index until max.
* @__tree: The Maple Tree
* @__entry: The current entry
- * @__index: The index to update to track the location in the tree
+ * @__index: The index to start the search from. Subsequently used as iterator.
* @__max: The maximum limit for @index
*
- * Note: Will not return the zero entry.
+ * This iterator skips all entries, which resolve to a NULL pointer,
+ * e.g. entries which has been reserved with XA_ZERO_ENTRY.
*/
#define mt_for_each(__tree, __entry, __index, __max) \
for (__entry = mt_find(__tree, &(__index), __max); \
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f71ff9f0ec81..1c1072e3ca06 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -581,9 +581,7 @@ extern void *alloc_large_system_hash(const char *tablename,
unsigned long high_limit);
#define HASH_EARLY 0x00000001 /* Allocating during early boot? */
-#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min
- * shift passed via *_hash_shift */
-#define HASH_ZERO 0x00000004 /* Zero allocated hash table */
+#define HASH_ZERO 0x00000002 /* Zero allocated hash table */
/* Only NUMA needs hash distribution. 64bit NUMA architectures have
* sufficient vmalloc space.
@@ -596,13 +594,11 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif
#ifdef CONFIG_MEMTEST
-extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */
-extern bool early_memtest_done; /* Was early memtest done? */
-extern void early_memtest(phys_addr_t start, phys_addr_t end);
+void early_memtest(phys_addr_t start, phys_addr_t end);
+void memtest_report_meminfo(struct seq_file *m);
#else
-static inline void early_memtest(phys_addr_t start, phys_addr_t end)
-{
-}
+static inline void early_memtest(phys_addr_t start, phys_addr_t end) { }
+static inline void memtest_report_meminfo(struct seq_file *m) { }
#endif
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dbf26bc89dd4..ab94ad4597d0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -61,7 +61,6 @@ struct mem_cgroup_reclaim_cookie {
#ifdef CONFIG_MEMCG
#define MEM_CGROUP_ID_SHIFT 16
-#define MEM_CGROUP_ID_MAX USHRT_MAX
struct mem_cgroup_id {
int id;
@@ -112,6 +111,9 @@ struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
long state[NR_VM_NODE_STAT_ITEMS];
+ /* Non-hierarchical (CPU aggregated) state */
+ long state_local[NR_VM_NODE_STAT_ITEMS];
+
/* Pending child counts during tree propagation */
long state_pending[NR_VM_NODE_STAT_ITEMS];
};
@@ -588,7 +590,7 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root,
/*
* There is no reclaim protection applied to a targeted reclaim.
* We are special casing this specific case here because
- * mem_cgroup_protected calculation is not robust enough to keep
+ * mem_cgroup_calculate_protection is not robust enough to keep
* the protection invariant for calculated effective values for
* parallel reclaimers with different reclaim target. This is
* especially a problem for tail memcgs (as they have pages on LRU)
@@ -866,8 +868,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
* parent_mem_cgroup - find the accounting parent of a memcg
* @memcg: memcg whose parent to find
*
- * Returns the parent memcg, or NULL if this is the root or the memory
- * controller is in legacy no-hierarchy mode.
+ * Returns the parent memcg, or NULL if this is the root.
*/
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
@@ -1025,14 +1026,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
{
struct mem_cgroup_per_node *pn;
long x = 0;
- int cpu;
if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
+ x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -1163,7 +1162,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
-#define MEM_CGROUP_ID_MAX 0
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
@@ -1766,7 +1764,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
struct obj_cgroup *get_obj_cgroup_from_current(void);
-struct obj_cgroup *get_obj_cgroup_from_page(struct page *page);
+struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);
int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
@@ -1850,7 +1848,7 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}
-static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
return NULL;
}
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc9647b1b4f9..437441cdf78f 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
#ifdef CONFIG_NUMA
extern bool numa_demotion_enabled;
struct memory_dev_type *alloc_memory_type(int adistance);
-void destroy_memory_type(struct memory_dev_type *memtype);
+void put_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
@@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int adistance)
return NULL;
}
-static inline void destroy_memory_type(struct memory_dev_type *memtype)
+static inline void put_memory_type(struct memory_dev_type *memtype)
{
}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 31343566c221..f53cfdaaaa41 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -77,11 +77,7 @@ struct memory_block {
*/
struct zone *zone;
struct device dev;
- /*
- * Number of vmemmap pages. These pages
- * lay at the beginning of the memory block.
- */
- unsigned long nr_vmemmap_pages;
+ struct vmem_altmap *altmap;
struct memory_group *group; /* group (if any) for this block */
struct list_head group_next; /* next block inside memory group */
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
@@ -147,7 +143,7 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
int create_memory_block_devices(unsigned long start, unsigned long size,
- unsigned long vmemmap_pages,
+ struct vmem_altmap *altmap,
struct memory_group *group);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
* To do so, we will use the beginning of the hot-added range to build
* the page tables for the memmap array that describes the entire range.
* Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
*/
#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
/*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid,
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
struct mhp_params *params);
void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 396df1121bff..4f011eb6533d 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -3,6 +3,7 @@
#define _LINUX_MINMAX_H
#include <linux/const.h>
+#include <linux/types.h>
/*
* min()/max()/clamp() macros must accomplish three things:
@@ -158,6 +159,32 @@
*/
#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
+static inline bool in_range64(u64 val, u64 start, u64 len)
+{
+ return (val - start) < len;
+}
+
+static inline bool in_range32(u32 val, u32 start, u32 len)
+{
+ return (val - start) < len;
+}
+
+/**
+ * in_range - Determine if a value lies within a range.
+ * @val: Value to test.
+ * @start: First value in range.
+ * @len: Number of values in range.
+ *
+ * This is more efficient than "if (start <= val && val < (start + len))".
+ * It also gives a different answer if @start + @len overflows the size of
+ * the type by a sufficient amount to encompass @val. Decide for yourself
+ * which behaviour you want, or prove that start + len never overflow.
+ * Do not blindly replace one form with the other.
+ */
+#define in_range(val, start, len) \
+ ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ? \
+ in_range32(val, start, len) : in_range64(val, start, len))
+
/**
* swap - swap values of @a and @b
* @a: first value
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 34f9dba17c1a..53efddc4d178 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -532,13 +532,6 @@ struct vm_fault {
*/
};
-/* page entry size for vm->huge_fault() */
-enum page_entry_size {
- PE_SIZE_PTE = 0,
- PE_SIZE_PMD,
- PE_SIZE_PUD,
-};
-
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -562,8 +555,7 @@ struct vm_operations_struct {
int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long newflags);
vm_fault_t (*fault)(struct vm_fault *vmf);
- vm_fault_t (*huge_fault)(struct vm_fault *vmf,
- enum page_entry_size pe_size);
+ vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
vm_fault_t (*map_pages)(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff);
unsigned long (*pagesize)(struct vm_area_struct * area);
@@ -679,6 +671,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
rcu_read_unlock();
}
+/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
{
mmap_assert_write_locked(vma->vm_mm);
@@ -691,6 +684,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
return (vma->vm_lock_seq == *mm_lock_seq);
}
+/*
+ * Begin writing to a VMA.
+ * Exclude concurrent readers under the per-VMA lock until the currently
+ * write-locked mmap_lock is dropped or downgraded.
+ */
static inline void vma_start_write(struct vm_area_struct *vma)
{
int mm_lock_seq;
@@ -709,26 +707,17 @@ static inline void vma_start_write(struct vm_area_struct *vma)
up_write(&vma->vm_lock->lock);
}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
int mm_lock_seq;
- if (__is_vma_write_locked(vma, &mm_lock_seq))
- return true;
-
- if (!down_write_trylock(&vma->vm_lock->lock))
- return false;
-
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- up_write(&vma->vm_lock->lock);
- return true;
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}
-static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+static inline void vma_assert_locked(struct vm_area_struct *vma)
{
- int mm_lock_seq;
-
- VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+ if (!rwsem_is_locked(&vma->vm_lock->lock))
+ vma_assert_write_locked(vma);
}
static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
@@ -739,6 +728,22 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
vma->detached = detached;
}
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_end_read(vmf->vma);
+ else
+ mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+ vma_assert_locked(vmf->vma);
+ else
+ mmap_assert_locked(vmf->vma->vm_mm);
+}
+
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address);
@@ -748,25 +753,40 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
{ return false; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
-static inline bool vma_try_start_write(struct vm_area_struct *vma)
- { return true; }
-static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+ { mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_mark_detached(struct vm_area_struct *vma,
bool detached) {}
+static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
+{
+ return NULL;
+}
+
+static inline void release_fault_lock(struct vm_fault *vmf)
+{
+ mmap_read_unlock(vmf->vma->vm_mm);
+}
+
+static inline void assert_fault_locked(struct vm_fault *vmf)
+{
+ mmap_assert_locked(vmf->vma->vm_mm);
+}
+
#endif /* CONFIG_PER_VMA_LOCK */
+extern const struct vm_operations_struct vma_dummy_vm_ops;
+
/*
* WARNING: vma_init does not initialize vma->vm_lock.
* Use vm_area_alloc()/vm_area_free() if vma needs locking.
*/
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
- static const struct vm_operations_struct dummy_vm_ops = {};
-
memset(vma, 0, sizeof(*vma));
vma->vm_mm = mm;
- vma->vm_ops = &dummy_vm_ops;
+ vma->vm_ops = &vma_dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma_mark_detached(vma, false);
vma_numab_state_init(vma);
@@ -779,18 +799,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
ACCESS_PRIVATE(vma, __vm_flags) = flags;
}
-/* Use when VMA is part of the VMA tree and modifications need coordination */
+/*
+ * Use when VMA is part of the VMA tree and modifications need coordination
+ * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and
+ * it should be locked explicitly beforehand.
+ */
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- vma_start_write(vma);
+ vma_assert_write_locked(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
@@ -839,6 +863,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
return !vma->vm_ops;
}
+/*
+ * Indicate if the VMA is a heap for the given task; for
+ * /proc/PID/maps that is the heap of the main task.
+ */
+static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
+{
+ return vma->vm_start <= vma->vm_mm->brk &&
+ vma->vm_end >= vma->vm_mm->start_brk;
+}
+
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
+{
+ /*
+ * We make no effort to guess what a given thread considers to be
+ * its "stack". It's not even well-defined for programs written
+ * languages like Go.
+ */
+ return vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+}
+
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -976,7 +1025,7 @@ struct inode;
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
* prepared to handle wild return values. For example, PG_head may be
- * set before _folio_order is initialised, or this may be a tail page.
+ * set before the order is initialised, or this may be a tail page.
* See compaction.c for some good examples.
*/
static inline unsigned int compound_order(struct page *page)
@@ -985,7 +1034,7 @@ static inline unsigned int compound_order(struct page *page)
if (!test_bit(PG_head, &folio->flags))
return 0;
- return folio->_folio_order;
+ return folio->_flags_1 & 0xff;
}
/**
@@ -1001,7 +1050,7 @@ static inline unsigned int folio_order(struct folio *folio)
{
if (!folio_test_large(folio))
return 0;
- return folio->_folio_order;
+ return folio->_flags_1 & 0xff;
}
#include <linux/huge_mm.h>
@@ -1072,11 +1121,6 @@ unsigned long vmalloc_to_pfn(const void *addr);
* On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
* is no special casing required.
*/
-
-#ifndef is_ioremap_addr
-#define is_ioremap_addr(x) is_vmalloc_addr(x)
-#endif
-
#ifdef CONFIG_MMU
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
@@ -1220,33 +1264,6 @@ void folio_copy(struct folio *dst, struct folio *src);
unsigned long nr_free_buffer_pages(void);
-/*
- * Compound pages have a destructor function. Provide a
- * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a compound page.
- */
-typedef void compound_page_dtor(struct page *);
-
-/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
-enum compound_dtor_id {
- NULL_COMPOUND_DTOR,
- COMPOUND_PAGE_DTOR,
-#ifdef CONFIG_HUGETLB_PAGE
- HUGETLB_PAGE_DTOR,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- TRANSHUGE_PAGE_DTOR,
-#endif
- NR_COMPOUND_DTORS,
-};
-
-static inline void folio_set_compound_dtor(struct folio *folio,
- enum compound_dtor_id compound_dtor)
-{
- VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
- folio->_folio_dtor = compound_dtor;
-}
-
void destroy_large_folio(struct folio *folio);
/* Returns the number of bytes in this potentially compound page. */
@@ -1282,8 +1299,6 @@ static inline unsigned long thp_size(struct page *page)
return PAGE_SIZE << thp_order(page);
}
-void free_compound_page(struct page *page);
-
#ifdef CONFIG_MMU
/*
* Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
@@ -1299,7 +1314,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
}
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+ struct page *page, unsigned int nr, unsigned long addr);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
@@ -2006,7 +2022,7 @@ static inline long folio_nr_pages(struct folio *folio)
#ifdef CONFIG_64BIT
return folio->_folio_nr_pages;
#else
- return 1L << folio->_folio_order;
+ return 1L << (folio->_flags_1 & 0xff);
#endif
}
@@ -2024,7 +2040,7 @@ static inline unsigned long compound_nr(struct page *page)
#ifdef CONFIG_64BIT
return folio->_folio_nr_pages;
#else
- return 1L << folio->_folio_order;
+ return 1L << (folio->_flags_1 & 0xff);
#endif
}
@@ -2170,7 +2186,6 @@ static inline void *folio_address(const struct folio *folio)
return page_address(&folio->page);
}
-extern void *page_rmapping(struct page *page);
extern pgoff_t __page_file_index(struct page *page);
/*
@@ -2238,18 +2253,6 @@ extern void pagefault_out_of_memory(void);
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))
/*
- * Flags passed to show_mem() and show_free_areas() to suppress output in
- * various contexts.
- */
-#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */
-
-extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask)
-{
- __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1);
-}
-
-/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
@@ -2317,9 +2320,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
zap_page_range_single(vma, vma->vm_start,
vma->vm_end - vma->vm_start, NULL);
}
-void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
+void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end, bool mm_wr_locked);
struct mmu_notifier_range;
@@ -2766,42 +2769,93 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
}
#endif /* CONFIG_MMU */
+static inline struct ptdesc *virt_to_ptdesc(const void *x)
+{
+ return page_ptdesc(virt_to_page(x));
+}
+
+static inline void *ptdesc_to_virt(const struct ptdesc *pt)
+{
+ return page_to_virt(ptdesc_page(pt));
+}
+
+static inline void *ptdesc_address(const struct ptdesc *pt)
+{
+ return folio_address(ptdesc_folio(pt));
+}
+
+static inline bool pagetable_is_reserved(struct ptdesc *pt)
+{
+ return folio_test_reserved(ptdesc_folio(pt));
+}
+
+/**
+ * pagetable_alloc - Allocate pagetables
+ * @gfp: GFP flags
+ * @order: desired pagetable order
+ *
+ * pagetable_alloc allocates memory for page tables as well as a page table
+ * descriptor to describe that memory.
+ *
+ * Return: The ptdesc describing the allocated page tables.
+ */
+static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+{
+ struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+ return page_ptdesc(page);
+}
+
+/**
+ * pagetable_free - Free pagetables
+ * @pt: The page table descriptor
+ *
+ * pagetable_free frees the memory of all page tables described by a page
+ * table descriptor and the memory for the descriptor itself.
+ */
+static inline void pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
-extern bool ptlock_alloc(struct page *page);
-extern void ptlock_free(struct page *page);
+bool ptlock_alloc(struct ptdesc *ptdesc);
+void ptlock_free(struct ptdesc *ptdesc);
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return page->ptl;
+ return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}
-static inline bool ptlock_alloc(struct page *page)
+static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
return true;
}
-static inline void ptlock_free(struct page *page)
+static inline void ptlock_free(struct ptdesc *ptdesc)
{
}
-static inline spinlock_t *ptlock_ptr(struct page *page)
+static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
- return &page->ptl;
+ return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_page(*pmd));
+ return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}
-static inline bool ptlock_init(struct page *page)
+static inline bool ptlock_init(struct ptdesc *ptdesc)
{
/*
* prep_new_page() initialize page->private (and therefore page->ptl)
@@ -2810,10 +2864,10 @@ static inline bool ptlock_init(struct page *page)
* It can happen if arch try to use slab for page table allocation:
* slab code uses page->slab_cache, which share storage with page->ptl.
*/
- VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
- if (!ptlock_alloc(page))
+ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));
+ if (!ptlock_alloc(ptdesc))
return false;
- spin_lock_init(ptlock_ptr(page));
+ spin_lock_init(ptlock_ptr(ptdesc));
return true;
}
@@ -2826,24 +2880,28 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
-static inline bool ptlock_init(struct page *page) { return true; }
-static inline void ptlock_free(struct page *page) {}
+static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline bool pgtable_pte_page_ctor(struct page *page)
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
- if (!ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pte_page_dtor(struct page *page)
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
{
- ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
@@ -2892,28 +2950,33 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd)
return virt_to_page((void *)((unsigned long) pmd & mask));
}
+static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
+{
+ return page_ptdesc(pmd_pgtable_page(pmd));
+}
+
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
- return ptlock_ptr(pmd_pgtable_page(pmd));
+ return ptlock_ptr(pmd_ptdesc(pmd));
}
-static inline bool pmd_ptlock_init(struct page *page)
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page->pmd_huge_pte = NULL;
+ ptdesc->pmd_huge_pte = NULL;
#endif
- return ptlock_init(page);
+ return ptlock_init(ptdesc);
}
-static inline void pmd_ptlock_free(struct page *page)
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
+ VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc));
#endif
- ptlock_free(page);
+ ptlock_free(ptdesc);
}
-#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte)
+#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)
#else
@@ -2922,8 +2985,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
}
-static inline bool pmd_ptlock_init(struct page *page) { return true; }
-static inline void pmd_ptlock_free(struct page *page) {}
+static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }
+static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {}
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
@@ -2936,20 +2999,24 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
}
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
- if (!pmd_ptlock_init(page))
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ if (!pmd_ptlock_init(ptdesc))
return false;
- __SetPageTable(page);
- inc_lruvec_page_state(page, NR_PAGETABLE);
+ __folio_set_pgtable(folio);
+ lruvec_stat_add_folio(folio, NR_PAGETABLE);
return true;
}
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc)
{
- pmd_ptlock_free(page);
- __ClearPageTable(page);
- dec_lruvec_page_state(page, NR_PAGETABLE);
+ struct folio *folio = ptdesc_folio(ptdesc);
+
+ pmd_ptlock_free(ptdesc);
+ __folio_clear_pgtable(folio);
+ lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}
/*
@@ -3004,6 +3071,11 @@ static inline void mark_page_reserved(struct page *page)
adjust_managed_page_count(page, -1);
}
+static inline void free_reserved_ptdesc(struct ptdesc *pt)
+{
+ free_reserved_page(ptdesc_page(pt));
+}
+
/*
* Default method to free all the __init memory into the buddy system.
* The freed pages will be poisoned with pattern "poison" if it's within
@@ -3069,9 +3141,9 @@ extern void mem_init(void);
extern void __init mmap_init(void);
extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
-static inline void show_mem(unsigned int flags, nodemask_t *nodemask)
+static inline void show_mem(void)
{
- __show_mem(flags, nodemask, MAX_NR_ZONES - 1);
+ __show_mem(0, NULL, MAX_NR_ZONES - 1);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
@@ -3509,8 +3581,8 @@ static inline bool debug_pagealloc_enabled(void)
}
/*
- * For use in fast paths after init_debug_pagealloc() has run, or when a
- * false negative result is not harmful when called too early.
+ * For use in fast paths after mem_debugging_and_hardening_init() has run,
+ * or when a false negative result is not harmful when called too early.
*/
static inline bool debug_pagealloc_enabled_static(void)
{
@@ -3665,13 +3737,32 @@ void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
- struct dev_pagemap *pgmap)
+#define VMEMMAP_RESERVE_NR 2
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- return is_power_of_2(sizeof(struct page)) &&
- pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+ unsigned long nr_pages;
+ unsigned long nr_vmemmap_pages;
+
+ if (!pgmap || !is_power_of_2(sizeof(struct page)))
+ return false;
+
+ nr_pages = pgmap_vmemmap_nr(pgmap);
+ nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+ /*
+ * For vmemmap optimization with DAX we need minimum 2 vmemmap
+ * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+ */
+ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
}
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
#else
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 21d6c72bcc71..8148b30a9df1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -523,6 +523,27 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
return atomic_read(&mm->tlb_flush_pending) > 1;
}
+#ifdef CONFIG_MMU
+/*
+ * Computes the pte marker to copy from the given source entry into dst_vma.
+ * If no marker should be copied, returns 0.
+ * The caller should insert a new pte created with make_pte_marker().
+ */
+static inline pte_marker copy_pte_marker(
+ swp_entry_t entry, struct vm_area_struct *dst_vma)
+{
+ pte_marker srcm = pte_marker_get(entry);
+ /* Always copy error entries. */
+ pte_marker dstm = srcm & PTE_MARKER_POISONED;
+
+ /* Only copy PTE markers if UFFD register matches. */
+ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma))
+ dstm |= PTE_MARKER_UFFD_WP;
+
+ return dstm;
+}
+#endif
+
/*
* If this pte is wr-protected by uffd-wp in any form, arm the special pte to
* replace a none pte. NOTE! This should only be called when *pte is already
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7d30dc4ff0ff..36c5b43999e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -141,20 +141,6 @@ struct page {
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
};
- struct { /* Page table pages */
- unsigned long _pt_pad_1; /* compound_head */
- pgtable_t pmd_huge_pte; /* protected by page->ptl */
- unsigned long _pt_pad_2; /* mapping */
- union {
- struct mm_struct *pt_mm; /* x86 pgds only */
- atomic_t pt_frag_refcount; /* powerpc */
- };
-#if ALLOC_SPLIT_PTLOCKS
- spinlock_t *ptl;
-#else
- spinlock_t ptl;
-#endif
- };
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
@@ -262,6 +248,14 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page)
return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
}
+/*
+ * A swap entry has to fit into a "unsigned long", as the entry is hidden
+ * in the "index" field of the swapper address space.
+ */
+typedef struct {
+ unsigned long val;
+} swp_entry_t;
+
/**
* struct folio - Represents a contiguous set of bytes.
* @flags: Identical to the page flags.
@@ -272,14 +266,12 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page)
* @index: Offset within the file, in units of pages. For anonymous memory,
* this is the index from the beginning of the mmap.
* @private: Filesystem per-folio data (see folio_attach_private()).
- * Used for swp_entry_t if folio_test_swapcache().
+ * @swap: Used for swp_entry_t if folio_test_swapcache().
* @_mapcount: Do not access this member directly. Use folio_mapcount() to
* find out how many times this folio is mapped by userspace.
* @_refcount: Do not access this member directly. Use folio_ref_count()
* to find how many references there are to this folio.
* @memcg_data: Memory Control Group data.
- * @_folio_dtor: Which destructor to use for this folio.
- * @_folio_order: Do not use directly, call folio_order().
* @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
* @_nr_pages_mapped: Do not use directly, call folio_mapcount().
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
@@ -317,7 +309,10 @@ struct folio {
};
struct address_space *mapping;
pgoff_t index;
- void *private;
+ union {
+ void *private;
+ swp_entry_t swap;
+ };
atomic_t _mapcount;
atomic_t _refcount;
#ifdef CONFIG_MEMCG
@@ -331,9 +326,8 @@ struct folio {
struct {
unsigned long _flags_1;
unsigned long _head_1;
+ unsigned long _folio_avail;
/* public: */
- unsigned char _folio_dtor;
- unsigned char _folio_order;
atomic_t _entire_mapcount;
atomic_t _nr_pages_mapped;
atomic_t _pincount;
@@ -391,8 +385,89 @@ FOLIO_MATCH(compound_head, _head_1);
offsetof(struct page, pg) + 2 * sizeof(struct page))
FOLIO_MATCH(flags, _flags_2);
FOLIO_MATCH(compound_head, _head_2);
+FOLIO_MATCH(flags, _flags_2a);
+FOLIO_MATCH(compound_head, _head_2a);
#undef FOLIO_MATCH
+/**
+ * struct ptdesc - Memory descriptor for page tables.
+ * @__page_flags: Same as page flags. Unused for page tables.
+ * @pt_rcu_head: For freeing page table pages.
+ * @pt_list: List of used page tables. Used for s390 and x86.
+ * @_pt_pad_1: Padding that aliases with page's compound head.
+ * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs.
+ * @__page_mapping: Aliases with page->mapping. Unused for page tables.
+ * @pt_mm: Used for x86 pgds.
+ * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only.
+ * @_pt_pad_2: Padding to ensure proper alignment.
+ * @ptl: Lock for the page table.
+ * @__page_type: Same as page->page_type. Unused for page tables.
+ * @_refcount: Same as page refcount. Used for s390 page tables.
+ * @pt_memcg_data: Memcg data. Tracked for page tables here.
+ *
+ * This struct overlays struct page for now. Do not modify without a good
+ * understanding of the issues.
+ */
+struct ptdesc {
+ unsigned long __page_flags;
+
+ union {
+ struct rcu_head pt_rcu_head;
+ struct list_head pt_list;
+ struct {
+ unsigned long _pt_pad_1;
+ pgtable_t pmd_huge_pte;
+ };
+ };
+ unsigned long __page_mapping;
+
+ union {
+ struct mm_struct *pt_mm;
+ atomic_t pt_frag_refcount;
+ };
+
+ union {
+ unsigned long _pt_pad_2;
+#if ALLOC_SPLIT_PTLOCKS
+ spinlock_t *ptl;
+#else
+ spinlock_t ptl;
+#endif
+ };
+ unsigned int __page_type;
+ atomic_t _refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long pt_memcg_data;
+#endif
+};
+
+#define TABLE_MATCH(pg, pt) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))
+TABLE_MATCH(flags, __page_flags);
+TABLE_MATCH(compound_head, pt_list);
+TABLE_MATCH(compound_head, _pt_pad_1);
+TABLE_MATCH(mapping, __page_mapping);
+TABLE_MATCH(rcu_head, pt_rcu_head);
+TABLE_MATCH(page_type, __page_type);
+TABLE_MATCH(_refcount, _refcount);
+#ifdef CONFIG_MEMCG
+TABLE_MATCH(memcg_data, pt_memcg_data);
+#endif
+#undef TABLE_MATCH
+static_assert(sizeof(struct ptdesc) <= sizeof(struct page));
+
+#define ptdesc_page(pt) (_Generic((pt), \
+ const struct ptdesc *: (const struct page *)(pt), \
+ struct ptdesc *: (struct page *)(pt)))
+
+#define ptdesc_folio(pt) (_Generic((pt), \
+ const struct ptdesc *: (const struct folio *)(pt), \
+ struct ptdesc *: (struct folio *)(pt)))
+
+#define page_ptdesc(p) (_Generic((p), \
+ const struct page *: (const struct ptdesc *)(p), \
+ struct page *: (struct ptdesc *)(p)))
+
/*
* Used for sizing the vmemmap region on some architectures
*/
@@ -812,7 +887,7 @@ struct mm_struct {
#ifdef CONFIG_KSM
/*
* Represent how many pages of this process are involved in KSM
- * merging.
+ * merging (not including ksm_zero_pages).
*/
unsigned long ksm_merging_pages;
/*
@@ -820,7 +895,12 @@ struct mm_struct {
* including merged and not merged.
*/
unsigned long ksm_rmap_items;
-#endif
+ /*
+ * Represent how many empty pages are merged with kernel zero
+ * pages when enabling KSM use_zero_pages.
+ */
+ unsigned long ksm_zero_pages;
+#endif /* CONFIG_KSM */
#ifdef CONFIG_LRU_GEN
struct {
/* this mm_struct is on lru_gen_mm_list */
@@ -1105,7 +1185,8 @@ enum vm_fault_reason {
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
+ { VM_FAULT_COMPLETED, "COMPLETED" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
@@ -1139,14 +1220,6 @@ enum tlb_flush_reason {
NR_TLB_FLUSH_REASONS,
};
- /*
- * A swap entry has to fit into a "unsigned long", as the entry is hidden
- * in the "index" field of the swapper address space.
- */
-typedef struct {
- unsigned long val;
-} swp_entry_t;
-
/**
* enum fault_flag - Fault flag definitions.
* @FAULT_FLAG_WRITE: Fault was a write fault.
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index 5414b5c6a103..aa44fff8bb9d 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -52,8 +52,8 @@ struct tlbflush_unmap_batch {
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* The arch code makes the following promise: generic code can modify a
- * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
- * needed barriers), then call arch_tlbbatch_flush(), and the entries
+ * PTE, then call arch_tlbbatch_add_pending() (which internally provides
+ * all needed barriers), then call arch_tlbbatch_flush(), and the entries
* will be flushed on all CPUs by the time that arch_tlbbatch_flush()
* returns.
*/
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index e05e167dbd16..8d38dcb6d044 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -73,6 +73,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
}
#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Drop all currently-held per-VMA locks.
+ * This is called from the mmap_lock implementation directly before releasing
+ * a write-locked mmap_lock (or downgrading it to read-locked).
+ * This should normally NOT be called manually from other places.
+ * If you want to call this manually anyway, keep in mind that this will release
+ * *all* VMA write locks, including ones from further up the stack.
+ */
static inline void vma_end_write_all(struct mm_struct *mm)
{
mmap_assert_write_locked(mm);
@@ -118,16 +126,6 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
return ret;
}
-static inline bool mmap_write_trylock(struct mm_struct *mm)
-{
- bool ret;
-
- __mmap_lock_trace_start_locking(mm, true);
- ret = down_write_trylock(&mm->mmap_lock) != 0;
- __mmap_lock_trace_acquire_returned(mm, true, ret);
- return ret;
-}
-
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..6e3c857606f1 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -187,27 +187,27 @@ struct mmu_notifier_ops {
const struct mmu_notifier_range *range);
/*
- * invalidate_range() is either called between
- * invalidate_range_start() and invalidate_range_end() when the
- * VM has to free pages that where unmapped, but before the
- * pages are actually freed, or outside of _start()/_end() when
- * a (remote) TLB is necessary.
+ * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
+ * which shares page-tables with the CPU. The
+ * invalidate_range_start()/end() callbacks should not be implemented as
+ * invalidate_secondary_tlbs() already catches the points in time when
+ * an external TLB needs to be flushed.
*
- * If invalidate_range() is used to manage a non-CPU TLB with
- * shared page-tables, it not necessary to implement the
- * invalidate_range_start()/end() notifiers, as
- * invalidate_range() already catches the points in time when an
- * external TLB range needs to be flushed. For more in depth
- * discussion on this see Documentation/mm/mmu_notifier.rst
+ * This requires arch_invalidate_secondary_tlbs() to be called while
+ * holding the ptl spin-lock and therefore this callback is not allowed
+ * to sleep.
*
- * Note that this function might be called with just a sub-range
- * of what was passed to invalidate_range_start()/end(), if
- * called between those functions.
+ * This is called by architecture code whenever invalidating a TLB
+ * entry. It is assumed that any secondary TLB has the same rules for
+ * when invalidations are required. If this is not the case architecture
+ * code will need to call this explicitly when required for secondary
+ * TLB invalidation.
*/
- void (*invalidate_range)(struct mmu_notifier *subscription,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end);
+ void (*arch_invalidate_secondary_tlbs)(
+ struct mmu_notifier *subscription,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end);
/*
* These callbacks are used with the get/put interface to manage the
@@ -395,10 +395,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
-extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
- bool only_end);
-extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
+extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end);
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
@@ -481,21 +480,14 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
might_sleep();
if (mm_has_notifiers(range->mm))
- __mmu_notifier_invalidate_range_end(range, false);
-}
-
-static inline void
-mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
-{
- if (mm_has_notifiers(range->mm))
- __mmu_notifier_invalidate_range_end(range, true);
+ __mmu_notifier_invalidate_range_end(range);
}
-static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
if (mm_has_notifiers(mm))
- __mmu_notifier_invalidate_range(mm, start, end);
+ __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
@@ -582,45 +574,6 @@ static inline void mmu_notifier_range_init_owner(
__young; \
})
-#define ptep_clear_flush_notify(__vma, __address, __ptep) \
-({ \
- unsigned long ___addr = __address & PAGE_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pte_t ___pte; \
- \
- ___pte = ptep_clear_flush(__vma, __address, __ptep); \
- mmu_notifier_invalidate_range(___mm, ___addr, \
- ___addr + PAGE_SIZE); \
- \
- ___pte; \
-})
-
-#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pmd_t ___pmd; \
- \
- ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \
- mmu_notifier_invalidate_range(___mm, ___haddr, \
- ___haddr + HPAGE_PMD_SIZE); \
- \
- ___pmd; \
-})
-
-#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \
-({ \
- unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \
- struct mm_struct *___mm = (__vma)->vm_mm; \
- pud_t ___pud; \
- \
- ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \
- mmu_notifier_invalidate_range(___mm, ___haddr, \
- ___haddr + HPAGE_PUD_SIZE); \
- \
- ___pud; \
-})
-
/*
* set_pte_at_notify() sets the pte _after_ running the notifier.
* This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -711,12 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}
-static inline void
-mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
-{
-}
-
-static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
+static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5e50b78d58ea..4106fbc5b4b3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -676,7 +676,6 @@ enum zone_watermarks {
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
-/* Fields and list protected by pagesets local_lock in page_alloc.c */
struct per_cpu_pages {
spinlock_t lock; /* Protects lists field */
int count; /* number of pages in the list */
diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h
deleted file mode 100644
index b298998bd5a0..000000000000
--- a/include/linux/net_mm.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifdef CONFIG_MMU
-
-#ifdef CONFIG_INET
-extern const struct vm_operations_struct tcp_vm_ops;
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
- return vma->vm_ops == &tcp_vm_ops;
-}
-#else
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
- return false;
-}
-#endif /* CONFIG_INET*/
-
-#endif /* CONFIG_MMU */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92a2063a0a23..5c02720c53a5 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -99,13 +99,15 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
+ PG_writeback, /* Page is under writeback */
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
+ PG_head, /* Must be in bit 6 */
+ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_active,
PG_workingset,
- PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
@@ -113,8 +115,6 @@ enum pageflags {
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
- PG_writeback, /* Page is under writeback */
- PG_head, /* A head page */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
@@ -171,15 +171,6 @@ enum pageflags {
/* Remapped by swiotlb-xen. */
PG_xen_remapped = PG_owner_priv_1,
-#ifdef CONFIG_MEMORY_FAILURE
- /*
- * Compound pages. Stored in first tail page's flags.
- * Indicates that at least one subpage is hwpoisoned in the
- * THP.
- */
- PG_has_hwpoisoned = PG_error,
-#endif
-
/* non-lru isolated movable page */
PG_isolated = PG_reclaim,
@@ -190,6 +181,17 @@ enum pageflags {
/* For self-hosted memmap pages */
PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif
+
+ /*
+ * Flags only valid for compound pages. Stored in first tail page's
+ * flags word. Cannot use the first 8 flags or any flag marked as
+ * PF_ANY.
+ */
+
+ /* At least one page in this folio has the hwpoison flag set */
+ PG_has_hwpoisoned = PG_error,
+ PG_hugetlb = PG_active,
+ PG_large_rmappable = PG_workingset, /* anon or file-backed */
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -806,13 +808,32 @@ static inline void ClearPageCompound(struct page *page)
BUG_ON(!PageHead(page));
ClearPageHead(page);
}
+PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
+#else
+TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
#endif
#define PG_head_mask ((1UL << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
-bool folio_test_hugetlb(struct folio *folio);
+SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
+CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
+
+/**
+ * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
+ * @folio: The folio to test.
+ *
+ * Context: Any context. Caller should have a reference on the folio to
+ * prevent it from being turned into a tail page.
+ * Return: True for hugetlbfs folios, false for anon folios or folios
+ * belonging to other filesystems.
+ */
+static inline bool folio_test_hugetlb(struct folio *folio)
+{
+ return folio_test_large(folio) &&
+ test_bit(PG_hugetlb, folio_flags(folio, 1));
+}
#else
TESTPAGEFLAG_FALSE(Huge, hugetlb)
#endif
@@ -832,11 +853,6 @@ static inline int PageTransHuge(struct page *page)
return PageHead(page);
}
-static inline bool folio_test_transhuge(struct folio *folio)
-{
- return folio_test_head(folio);
-}
-
/*
* PageTransCompound returns true for both transparent huge pages
* and hugetlbfs pages, so it should only be called when it's known
@@ -908,6 +924,8 @@ static inline bool is_page_hwpoison(struct page *page)
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+#define folio_test_type(folio, flag) \
+ ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
static inline int page_type_has_type(unsigned int page_type)
{
@@ -919,27 +937,41 @@ static inline int page_has_type(struct page *page)
return page_type_has_type(page->page_type);
}
-#define PAGE_TYPE_OPS(uname, lname) \
-static __always_inline int Page##uname(struct page *page) \
+#define PAGE_TYPE_OPS(uname, lname, fname) \
+static __always_inline int Page##uname(const struct page *page) \
{ \
return PageType(page, PG_##lname); \
} \
+static __always_inline int folio_test_##fname(const struct folio *folio)\
+{ \
+ return folio_test_type(folio, PG_##lname); \
+} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!PageType(page, 0), page); \
page->page_type &= ~PG_##lname; \
} \
+static __always_inline void __folio_set_##fname(struct folio *folio) \
+{ \
+ VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
+ folio->page.page_type &= ~PG_##lname; \
+} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type |= PG_##lname; \
-}
+} \
+static __always_inline void __folio_clear_##fname(struct folio *folio) \
+{ \
+ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
+ folio->page.page_type |= PG_##lname; \
+} \
/*
* PageBuddy() indicates that the page is free and in the buddy system
* (see mm/page_alloc.c).
*/
-PAGE_TYPE_OPS(Buddy, buddy)
+PAGE_TYPE_OPS(Buddy, buddy, buddy)
/*
* PageOffline() indicates that the page is logically offline although the
@@ -963,7 +995,7 @@ PAGE_TYPE_OPS(Buddy, buddy)
* pages should check PageOffline() and synchronize with such drivers using
* page_offline_freeze()/page_offline_thaw().
*/
-PAGE_TYPE_OPS(Offline, offline)
+PAGE_TYPE_OPS(Offline, offline, offline)
extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
@@ -973,12 +1005,12 @@ extern void page_offline_end(void);
/*
* Marks pages in use as page tables.
*/
-PAGE_TYPE_OPS(Table, table)
+PAGE_TYPE_OPS(Table, table, pgtable)
/*
* Marks guardpages used with debug_pagealloc.
*/
-PAGE_TYPE_OPS(Guard, guard)
+PAGE_TYPE_OPS(Guard, guard, guard)
extern bool is_free_buddy_page(struct page *page);
@@ -1040,6 +1072,14 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
#define PAGE_FLAGS_CHECK_AT_PREP \
((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
+/*
+ * Flags stored in the second page of a compound page. They may overlap
+ * the CHECK_AT_FREE flags above, so need to be cleared.
+ */
+#define PAGE_FLAGS_SECOND \
+ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
+ 1UL << PG_hugetlb | 1UL << PG_large_rmappable)
+
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
/**
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 67314f648aeb..be98564191e6 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -8,6 +8,7 @@
struct pglist_data;
+#ifdef CONFIG_PAGE_EXTENSION
/**
* struct page_ext_operations - per page_ext client operations
* @offset: Offset to the client's data within page_ext. Offset is returned to
@@ -29,8 +30,6 @@ struct page_ext_operations {
bool need_shared_flags;
};
-#ifdef CONFIG_PAGE_EXTENSION
-
/*
* The page_ext_flags users must set need_shared_flags to true.
*/
@@ -82,6 +81,12 @@ static inline void page_ext_init(void)
extern struct page_ext *page_ext_get(struct page *page);
extern void page_ext_put(struct page_ext *page_ext);
+static inline void *page_ext_data(struct page_ext *page_ext,
+ struct page_ext_operations *ops)
+{
+ return (void *)(page_ext) + ops->offset;
+}
+
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
void *next = curr;
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 5cb7bd2078ec..d8f344840643 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -144,9 +144,4 @@ static inline void set_page_idle(struct page *page)
{
folio_set_idle(page_folio(page));
}
-
-static inline void clear_page_idle(struct page *page)
-{
- folio_clear_idle(page_folio(page));
-}
#endif /* _LINUX_MM_PAGE_IDLE_H */
diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h
index 01e16c7696ec..6722941c7cb8 100644
--- a/include/linux/page_table_check.h
+++ b/include/linux/page_table_check.h
@@ -14,18 +14,13 @@ extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;
void __page_table_check_zero(struct page *page, unsigned int order);
-void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t pte);
-void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
- pmd_t pmd);
-void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
- pud_t pud);
-void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte);
-void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp, pmd_t pmd);
-void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
- pud_t *pudp, pud_t pud);
+void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
+void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
+void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
+void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
+ unsigned int nr);
+void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd);
+void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
pmd_t pmd);
@@ -46,61 +41,55 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
__page_table_check_zero(page, order);
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pte_clear(mm, addr, pte);
+ __page_table_check_pte_clear(mm, pte);
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmd_clear(mm, addr, pmd);
+ __page_table_check_pmd_clear(mm, pmd);
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm,
- unsigned long addr, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pud_clear(mm, addr, pud);
+ __page_table_check_pud_clear(mm, pud);
}
-static inline void page_table_check_pte_set(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
- pte_t pte)
+static inline void page_table_check_ptes_set(struct mm_struct *mm,
+ pte_t *ptep, pte_t pte, unsigned int nr)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pte_set(mm, addr, ptep, pte);
+ __page_table_check_ptes_set(mm, ptep, pte, nr);
}
-static inline void page_table_check_pmd_set(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp,
+static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
pmd_t pmd)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ __page_table_check_pmd_set(mm, pmdp, pmd);
}
-static inline void page_table_check_pud_set(struct mm_struct *mm,
- unsigned long addr, pud_t *pudp,
+static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
pud_t pud)
{
if (static_branch_likely(&page_table_check_disabled))
return;
- __page_table_check_pud_set(mm, addr, pudp, pud);
+ __page_table_check_pud_set(mm, pudp, pud);
}
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
@@ -123,35 +112,29 @@ static inline void page_table_check_free(struct page *page, unsigned int order)
{
}
-static inline void page_table_check_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t pte)
+static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
}
-static inline void page_table_check_pmd_clear(struct mm_struct *mm,
- unsigned long addr, pmd_t pmd)
+static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
}
-static inline void page_table_check_pud_clear(struct mm_struct *mm,
- unsigned long addr, pud_t pud)
+static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
}
-static inline void page_table_check_pte_set(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
- pte_t pte)
+static inline void page_table_check_ptes_set(struct mm_struct *mm,
+ pte_t *ptep, pte_t pte, unsigned int nr)
{
}
-static inline void page_table_check_pmd_set(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp,
+static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
pmd_t pmd)
{
}
-static inline void page_table_check_pud_set(struct mm_struct *mm,
- unsigned long addr, pud_t *pudp,
+static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
pud_t pud)
{
}
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d87840acbfb2..351c3b7f93a1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -203,6 +203,7 @@ enum mapping_flags {
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
AS_LARGE_FOLIO_SUPPORT = 6,
+ AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
};
/**
@@ -273,6 +274,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping)
return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}
+static inline bool mapping_release_always(const struct address_space *mapping)
+{
+ return test_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
+static inline void mapping_set_release_always(struct address_space *mapping)
+{
+ set_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
+static inline void mapping_clear_release_always(struct address_space *mapping)
+{
+ clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+}
+
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
return mapping->gfp_mask;
@@ -373,23 +389,31 @@ static inline struct address_space *folio_file_mapping(struct folio *folio)
return folio->mapping;
}
-static inline struct address_space *page_file_mapping(struct page *page)
-{
- return folio_file_mapping(page_folio(page));
-}
-
-/*
- * For file cache pages, return the address_space, otherwise return NULL
+/**
+ * folio_flush_mapping - Find the file mapping this folio belongs to.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to. Anonymous folios return NULL, even if they're in
+ * the swap cache. Other kinds of folio also return NULL.
+ *
+ * This is ONLY used by architecture cache flushing code. If you aren't
+ * writing cache flushing code, you want either folio_mapping() or
+ * folio_file_mapping().
*/
-static inline struct address_space *page_mapping_file(struct page *page)
+static inline struct address_space *folio_flush_mapping(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
if (unlikely(folio_test_swapcache(folio)))
return NULL;
+
return folio_mapping(folio);
}
+static inline struct address_space *page_file_mapping(struct page *page)
+{
+ return folio_file_mapping(page_folio(page));
+}
+
/**
* folio_inode - Get the host inode for this folio.
* @folio: The folio.
@@ -960,8 +984,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
-bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
- unsigned int flags);
+vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);
@@ -1065,11 +1088,13 @@ static inline int folio_lock_killable(struct folio *folio)
* Return value and mmap_lock implications depend on flags; see
* __folio_lock_or_retry().
*/
-static inline bool folio_lock_or_retry(struct folio *folio,
- struct mm_struct *mm, unsigned int flags)
+static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
+ struct vm_fault *vmf)
{
might_sleep();
- return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags);
+ if (!folio_trylock(folio))
+ return __folio_lock_or_retry(folio, vmf);
+ return 0;
}
/*
@@ -1104,11 +1129,6 @@ static inline void wait_on_page_locked(struct page *page)
folio_wait_locked(page_folio(page));
}
-static inline int wait_on_page_locked_killable(struct page *page)
-{
- return folio_wait_locked_killable(page_folio(page));
-}
-
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5063b482e34f..f49abcfe5eda 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -5,6 +5,9 @@
#include <linux/pfn.h>
#include <asm/pgtable.h>
+#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT)
+
#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU
@@ -63,7 +66,6 @@ static inline unsigned long pte_index(unsigned long address)
{
return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}
-#define pte_index pte_index
#ifndef pmd_index
static inline unsigned long pmd_index(unsigned long address)
@@ -99,7 +101,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte) do { \
kunmap_local((pte)); \
- /* rcu_read_unlock() to be added later */ \
+ rcu_read_unlock(); \
} while (0)
#else
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
@@ -108,10 +110,12 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
}
static inline void pte_unmap(pte_t *pte)
{
- /* rcu_read_unlock() to be added later */
+ rcu_read_unlock();
}
#endif
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);
+
/* Find an entry in the second-level page table.. */
#ifndef pmd_offset
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -180,6 +184,60 @@ static inline int pmd_young(pmd_t pmd)
}
#endif
+/*
+ * A facility to provide lazy MMU batching. This allows PTE updates and
+ * page invalidations to be delayed until a call to leave lazy MMU mode
+ * is issued. Some architectures may benefit from doing this, and it is
+ * beneficial for both shadow and direct mode hypervisors, which may batch
+ * the PTE updates which happen during this window. Note that using this
+ * interface requires that read hazards be removed from the code. A read
+ * hazard could result in the direct mode hypervisor case, since the actual
+ * write to the page tables may not yet have taken place, so reads though
+ * a raw PTE pointer after it has been modified are not guaranteed to be
+ * up to date. This mode can only be entered and left under the protection of
+ * the page table locks for all page tables which may be modified. In the UP
+ * case, this is required so that preemption is disabled, and in the SMP case,
+ * it must synchronize the delayed page table writes properly on other CPUs.
+ */
+#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+#define arch_enter_lazy_mmu_mode() do {} while (0)
+#define arch_leave_lazy_mmu_mode() do {} while (0)
+#define arch_flush_lazy_mmu_mode() do {} while (0)
+#endif
+
+#ifndef set_ptes
+/**
+ * set_ptes - Map consecutive pages to a contiguous range of addresses.
+ * @mm: Address space to map the pages into.
+ * @addr: Address to map the first page at.
+ * @ptep: Page table pointer for the first entry.
+ * @pte: Page table entry for the first page.
+ * @nr: Number of pages to map.
+ *
+ * May be overridden by the architecture, or the architecture can define
+ * set_pte() and PFN_PTE_SHIFT.
+ *
+ * Context: The caller holds the page table lock. The pages all belong
+ * to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ page_table_check_ptes_set(mm, ptep, pte, nr);
+
+ arch_enter_lazy_mmu_mode();
+ for (;;) {
+ set_pte(ptep, pte);
+ if (--nr == 0)
+ break;
+ ptep++;
+ pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+ }
+ arch_leave_lazy_mmu_mode();
+}
+#endif
+#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@@ -320,7 +378,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
{
pte_t pte = ptep_get(ptep);
pte_clear(mm, address, ptep);
- page_table_check_pte_clear(mm, address, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
#endif
@@ -390,6 +448,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
+#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
@@ -408,6 +467,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
return pmdp_get(pmdp);
}
+static inline void pmdp_get_lockless_sync(void)
+{
+}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -419,7 +481,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
pmd_t pmd = *pmdp;
pmd_clear(pmdp);
- page_table_check_pmd_clear(mm, address, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -432,7 +494,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
pud_t pud = *pudp;
pud_clear(pudp);
- page_table_check_pud_clear(mm, address, pud);
+ page_table_check_pud_clear(mm, pud);
return pud;
}
@@ -450,11 +512,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
-static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
int full)
{
- return pudp_huge_get_and_clear(mm, address, pudp);
+ return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -558,6 +620,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void pudp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pud_t *pudp)
{
@@ -571,6 +634,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
{
BUILD_BUG();
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif
@@ -693,11 +757,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
return pmd_val(pmd_a) == pmd_val(pmd_b);
}
+#endif
+#ifndef pud_same
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
return pud_val(pud_a) == pud_val(pud_b);
}
+#define pud_same pud_same
#endif
#ifndef __HAVE_ARCH_P4D_SAME
@@ -1041,27 +1108,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#endif
/*
- * A facility to provide lazy MMU batching. This allows PTE updates and
- * page invalidations to be delayed until a call to leave lazy MMU mode
- * is issued. Some architectures may benefit from doing this, and it is
- * beneficial for both shadow and direct mode hypervisors, which may batch
- * the PTE updates which happen during this window. Note that using this
- * interface requires that read hazards be removed from the code. A read
- * hazard could result in the direct mode hypervisor case, since the actual
- * write to the page tables may not yet have taken place, so reads though
- * a raw PTE pointer after it has been modified are not guaranteed to be
- * up to date. This mode can only be entered and left under the protection of
- * the page table locks for all page tables which may be modified. In the UP
- * case, this is required so that preemption is disabled, and in the SMP case,
- * it must synchronize the delayed page table writes properly on other CPUs.
- */
-#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-#define arch_enter_lazy_mmu_mode() do {} while (0)
-#define arch_leave_lazy_mmu_mode() do {} while (0)
-#define arch_flush_lazy_mmu_mode() do {} while (0)
-#endif
-
-/*
* A facility to provide batching of the reload of page tables and
* other process state with the actual context switch code for
* paravirtualized guests. By convention, only one of the batched
@@ -1322,12 +1368,16 @@ static inline int pud_trans_unstable(pud_t *pud)
#ifndef CONFIG_NUMA_BALANCING
/*
- * Technically a PTE can be PROTNONE even when not doing NUMA balancing but
- * the only case the kernel cares is for NUMA balancing and is only ever set
- * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked
- * _PAGE_PROTNONE so by default, implement the helper as "always no". It
- * is the responsibility of the caller to distinguish between PROT_NONE
- * protections and NUMA hinting fault protections.
+ * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
+ * perfectly valid to indicate "no" in that case, which is why our default
+ * implementation defaults to "always no".
+ *
+ * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
+ * page protection due to NUMA hinting. NUMA hinting faults only apply in
+ * accessible VMAs.
+ *
+ * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
+ * looking at the VMA accessibility is sufficient.
*/
static inline int pte_protnone(pte_t pte)
{
@@ -1499,6 +1549,9 @@ typedef unsigned int pgtbl_mod_mask;
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif
+#ifndef has_transparent_pud_hugepage
+#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#endif
/*
* On some architectures it depends on the mm if the p4d/pud or pmd
* layer of the page table hierarchy is folded or not.
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c758809d5bcf..f9f9931e02d6 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -17,18 +17,10 @@
struct fs_pin;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
-/*
- * sysctl for vm.memfd_noexec
- * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
- * acts like MFD_EXEC was set.
- * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL
- * acts like MFD_NOEXEC_SEAL was set.
- * 2: memfd_create() without MFD_NOEXEC_SEAL will be
- * rejected.
- */
-#define MEMFD_NOEXEC_SCOPE_EXEC 0
-#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1
-#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2
+/* modes for vm.memfd_noexec sysctl */
+#define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */
+#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */
#endif
struct pid_namespace {
@@ -47,7 +39,6 @@ struct pid_namespace {
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
- /* sysctl for vm.memfd_noexec */
int memfd_noexec_scope;
#endif
} __randomize_layout;
@@ -64,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ int scope = MEMFD_NOEXEC_SCOPE_EXEC;
+
+ for (; ns; ns = ns->parent)
+ scope = max(scope, READ_ONCE(ns->memfd_noexec_scope));
+
+ return scope;
+}
+#else
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ return 0;
+}
+#endif
+
extern struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
@@ -78,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}
+static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
+{
+ return 0;
+}
+
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *ns)
{
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b87d01660412..a3825ce81102 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
void page_add_file_rmap(struct page *, struct vm_area_struct *,
bool compound);
+void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
+ struct vm_area_struct *, bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
bool compound);
diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index 988528b5da43..35f3a4a8ceb1 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -6,24 +6,23 @@
extern const struct address_space_operations secretmem_aops;
-static inline bool page_is_secretmem(struct page *page)
+static inline bool folio_is_secretmem(struct folio *folio)
{
struct address_space *mapping;
/*
- * Using page_mapping() is quite slow because of the actual call
- * instruction and repeated compound_head(page) inside the
- * page_mapping() function.
+ * Using folio_mapping() is quite slow because of the actual call
+ * instruction.
* We know that secretmem pages are not compound and LRU so we can
* save a couple of cycles here.
*/
- if (PageCompound(page) || !PageLRU(page))
+ if (folio_test_large(folio) || !folio_test_lru(folio))
return false;
mapping = (struct address_space *)
- ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
+ ((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
- if (!mapping || mapping != page->mapping)
+ if (!mapping || mapping != folio->mapping)
return false;
return mapping->a_ops == &secretmem_aops;
@@ -39,7 +38,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma)
return false;
}
-static inline bool page_is_secretmem(struct page *page)
+static inline bool folio_is_secretmem(struct folio *folio)
{
return false;
}
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 456546443f1f..493487ed7c38 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -302,10 +302,6 @@ struct swap_info_struct {
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size; /* seldom referenced */
struct completion comp; /* seldom referenced */
-#ifdef CONFIG_FRONTSWAP
- unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
- atomic_t frontswap_pages; /* frontswap pages in-use counter */
-#endif
spinlock_t lock; /*
* protect map scan related fields like
* swap_map, lowest_bit, highest_bit,
@@ -337,15 +333,13 @@ struct swap_info_struct {
*/
};
-static inline swp_entry_t folio_swap_entry(struct folio *folio)
+static inline swp_entry_t page_swap_entry(struct page *page)
{
- swp_entry_t entry = { .val = page_private(&folio->page) };
- return entry;
-}
+ struct folio *folio = page_folio(page);
+ swp_entry_t entry = folio->swap;
-static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
-{
- folio->private = (void *)entry.val;
+ entry.val += folio_page_idx(folio, page);
+ return entry;
}
/* linux/mm/workingset.c */
@@ -630,11 +624,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
}
#endif
-#ifdef CONFIG_ZSWAP
-extern u64 zswap_pool_total_size;
-extern atomic_t zswap_stored_pages;
-#endif
-
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index 7ed529a77c5b..99e3ed469e88 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -2,11 +2,6 @@
#ifndef _LINUX_SWAPFILE_H
#define _LINUX_SWAPFILE_H
-/*
- * these were static in swapfile.c but frontswap.c needs them and we don't
- * want to expose them to the dozens of source files that include swap.h
- */
-extern struct swap_info_struct *swap_info[];
extern unsigned long generic_max_swapfile_size(void);
unsigned long arch_max_swapfile_size(void);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 4c932cb45e0b..bff1e8d97de0 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
typedef unsigned long pte_marker;
#define PTE_MARKER_UFFD_WP BIT(0)
-#define PTE_MARKER_SWAPIN_ERROR BIT(1)
+/*
+ * "Poisoned" here is meant in the very general sense of "future accesses are
+ * invalid", instead of referring very specifically to hardware memory errors.
+ * This marker is meant to represent any of various different causes of this.
+ */
+#define PTE_MARKER_POISONED BIT(1)
#define PTE_MARKER_MASK (BIT(2) - 1)
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
@@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker)
return swp_entry_to_pte(make_pte_marker_entry(marker));
}
-static inline swp_entry_t make_swapin_error_entry(void)
+static inline swp_entry_t make_poisoned_swp_entry(void)
{
- return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR);
+ return make_pte_marker_entry(PTE_MARKER_POISONED);
}
-static inline int is_swapin_error_entry(swp_entry_t entry)
+static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
return is_pte_marker_entry(entry) &&
- (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR);
+ (pte_marker_get(entry) & PTE_MARKER_POISONED);
}
/*
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac7b0c96d351..ac8c6854097c 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -46,6 +46,7 @@ enum mfill_atomic_mode {
MFILL_ATOMIC_COPY,
MFILL_ATOMIC_ZEROPAGE,
MFILL_ATOMIC_CONTINUE,
+ MFILL_ATOMIC_POISON,
NR_MFILL_ATOMIC_MODES,
};
@@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
new file mode 100644
index 000000000000..2a60ce39cfde
--- /dev/null
+++ b/include/linux/zswap.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_ZSWAP_H
+#define _LINUX_ZSWAP_H
+
+#include <linux/types.h>
+#include <linux/mm_types.h>
+
+extern u64 zswap_pool_total_size;
+extern atomic_t zswap_stored_pages;
+
+#ifdef CONFIG_ZSWAP
+
+bool zswap_store(struct folio *folio);
+bool zswap_load(struct folio *folio);
+void zswap_invalidate(int type, pgoff_t offset);
+void zswap_swapon(int type);
+void zswap_swapoff(int type);
+
+#else
+
+static inline bool zswap_store(struct folio *folio)
+{
+ return false;
+}
+
+static inline bool zswap_load(struct folio *folio)
+{
+ return false;
+}
+
+static inline void zswap_invalidate(int type, pgoff_t offset) {}
+static inline void zswap_swapon(int type) {}
+static inline void zswap_swapoff(int type) {}
+
+#endif
+
+#endif /* _LINUX_ZSWAP_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 07b21d9a9620..91688d0dadcd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,7 +45,6 @@
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>
-#include <linux/net_mm.h>
extern struct inet_hashinfo tcp_hashinfo;
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index 202b3e3e67ff..f50048af5fcc 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -8,25 +8,34 @@
#include <linux/types.h>
#include <linux/tracepoint.h>
-TRACE_EVENT(hugepage_set_pmd,
+DECLARE_EVENT_CLASS(hugepage_set,
- TP_PROTO(unsigned long addr, unsigned long pmd),
- TP_ARGS(addr, pmd),
+ TP_PROTO(unsigned long addr, unsigned long pte),
+ TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
- __field(unsigned long, pmd)
+ __field(unsigned long, pte)
),
TP_fast_assign(
__entry->addr = addr;
- __entry->pmd = pmd;
+ __entry->pte = pte;
),
- TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
+ TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte)
);
+DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd),
+ TP_ARGS(addr, pmd)
+);
-TRACE_EVENT(hugepage_update,
+DEFINE_EVENT(hugepage_set, hugepage_set_pud,
+ TP_PROTO(unsigned long addr, unsigned long pud),
+ TP_ARGS(addr, pud)
+);
+
+DECLARE_EVENT_CLASS(hugepage_update,
TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set),
TP_ARGS(addr, pte, clr, set),
@@ -48,6 +57,16 @@ TRACE_EVENT(hugepage_update,
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
);
+DEFINE_EVENT(hugepage_update, hugepage_update_pmd,
+ TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set),
+ TP_ARGS(addr, pmd, clr, set)
+);
+
+DEFINE_EVENT(hugepage_update, hugepage_update_pud,
+ TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set),
+ TP_ARGS(addr, pud, clr, set)
+);
+
DECLARE_EVENT_CLASS(migration_pmd,
TP_PROTO(unsigned long addr, unsigned long pmd),
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 66dd4cd277bd..62151706c5a3 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -39,7 +39,8 @@
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
- UFFD_FEATURE_WP_UNPOPULATED)
+ UFFD_FEATURE_WP_UNPOPULATED | \
+ UFFD_FEATURE_POISON)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -49,12 +50,14 @@
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_ZEROPAGE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
- (__u64)1 << _UFFDIO_CONTINUE)
+ (__u64)1 << _UFFDIO_CONTINUE | \
+ (__u64)1 << _UFFDIO_POISON)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_WRITEPROTECT)
+ (__u64)1 << _UFFDIO_POISON)
/*
* Valid ioctl command number range with this API is from 0x00 to
@@ -71,6 +74,7 @@
#define _UFFDIO_ZEROPAGE (0x04)
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
+#define _UFFDIO_POISON (0x08)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -91,6 +95,8 @@
struct uffdio_writeprotect)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
struct uffdio_continue)
+#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
+ struct uffdio_poison)
/* read() structure */
struct uffd_msg {
@@ -225,6 +231,7 @@ struct uffdio_api {
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
+#define UFFD_FEATURE_POISON (1<<14)
__u64 features;
__u64 ioctls;
@@ -321,6 +328,18 @@ struct uffdio_continue {
__s64 mapped;
};
+struct uffdio_poison {
+ struct uffdio_range range;
+#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * Fields below here are written by the ioctl and must be at the end:
+ * the copy_from_user will not read past here.
+ */
+ __s64 updated;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/