author     Linus Torvalds <torvalds@linux-foundation.org>  2023-04-27 19:42:02 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-04-27 19:42:02 -0700
commit     7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
tree       cc8fd6b4f936ec01e73238643757451e20478c07 /include/linux
parent     91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent     4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - Nick Piggin's "shoot lazy tlbs" series, to improve the performance of switching from a user process to a kernel thread.

 - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.

 - zsmalloc performance improvements from Sergey Senozhatsky.

 - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables.

 - VFS rationalizations from Christoph Hellwig: removal of most of the callers of write_one_page(), and making __filemap_get_folio()'s return value more useful.

 - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'.

 - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits.

 - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n).

 - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd, permitting userspace to wr-protect unpopulated ptes of anon memory.

 - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning.

 - Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte.

 - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness.

 - Cleanups to do_fault_around() by Lorenzo Stoakes.

 - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c.

 - Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more.

 - Lorenzo has also converted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases.

 - Lorenzo has also contributed further cleanups of vma_merge().

 - Chaitanya Prakash provides some fixes to the mmap selftesting code.

 - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults.

 - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking.

 - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu-isolated workloads.

 - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic.

 - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used.

 - Yosry Ahmed has improved the performance of memcg statistics flushing.

 - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem.

 - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths.

 - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing.

 - Pankaj Raghav has made page_endio() unneeded and has removed it.

 - Peter Xu contributed some rationalizations of the userfaultfd selftests.

 - Yosry Ahmed has fixed an issue around memcg's page reclaim accounting.

 - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code.

 - Longlong Xia has improved the way in which KSM handles hwpoisoned pages.

 - Peter Xu fixes a few issues with uffd-wp at fork() time.

 - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis (a usage sketch for this and the tmpfs noswap option follows this summary).
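As a quick illustration of two of the userspace-visible changes above (per-process KSM and tmpfs `noswap'), here is a minimal sketch, not part of the pull request itself. It assumes the PR_SET_MEMORY_MERGE prctl introduced by the "mm: add new api to enable ksm per process" commit and the tmpfs noswap mount option; the numeric prctl fallback and the /mnt target are illustrative assumptions.

/*
 * Illustrative only: opt the whole process into KSM merging and mount a
 * tmpfs instance that will never use swap. The fallback value below is an
 * assumption for builds against pre-6.4 uapi headers.
 */
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/mount.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* assumed value from the 6.4 uapi headers */
#endif

int main(void)
{
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("prctl(PR_SET_MEMORY_MERGE)");	/* requires CONFIG_KSM */

	/* equivalent of: mount -t tmpfs -o noswap tmpfs /mnt */
	if (mount("tmpfs", "/mnt", "tmpfs", 0, "noswap"))
		perror("mount -o noswap");

	return 0;
}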
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
  mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
  shmem: restrict noswap option to initial user namespace
  mm/khugepaged: fix conflicting mods to collapse_file()
  sparse: remove unnecessary 0 values from rc
  mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
  hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
  maple_tree: fix allocation in mas_sparse_area()
  mm: do not increment pgfault stats when page fault handler retries
  zsmalloc: allow only one active pool compaction context
  selftests/mm: add new selftests for KSM
  mm: add new KSM process and sysfs knobs
  mm: add new api to enable ksm per process
  mm: shrinkers: fix debugfs file permissions
  mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
  migrate_pages_batch: fix statistics for longterm pin retry
  userfaultfd: use helper function range_in_vma()
  lib/show_mem.c: use for_each_populated_zone() simplify code
  mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
  fs/buffer: convert create_page_buffers to folio_create_buffers
  fs/buffer: add folio_create_empty_buffers helper
  ...
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/buffer_head.h     |   6
-rw-r--r--  include/linux/cgroup.h          |   2
-rw-r--r--  include/linux/gfp.h             |   7
-rw-r--r--  include/linux/gfp_types.h       |  30
-rw-r--r--  include/linux/highmem.h         |  62
-rw-r--r--  include/linux/huge_mm.h         |  41
-rw-r--r--  include/linux/hugetlb.h         |  46
-rw-r--r--  include/linux/io-mapping.h      |  20
-rw-r--r--  include/linux/kmsan.h           |  43
-rw-r--r--  include/linux/ksm.h             |  37
-rw-r--r--  include/linux/memblock.h        |   2
-rw-r--r--  include/linux/memcontrol.h      |  10
-rw-r--r--  include/linux/memfd.h           |   4
-rw-r--r--  include/linux/mm.h              | 206
-rw-r--r--  include/linux/mm_inline.h       |   6
-rw-r--r--  include/linux/mm_types.h        |  46
-rw-r--r--  include/linux/mmap_lock.h       |  37
-rw-r--r--  include/linux/mmzone.h          |  34
-rw-r--r--  include/linux/page-flags.h      |  23
-rw-r--r--  include/linux/page_ext.h        |   2
-rw-r--r--  include/linux/pageblock-flags.h |   4
-rw-r--r--  include/linux/pagemap.h         |  15
-rw-r--r--  include/linux/pgtable.h         |   9
-rw-r--r--  include/linux/sched/coredump.h  |   1
-rw-r--r--  include/linux/sched/isolation.h |  12
-rw-r--r--  include/linux/sched/mm.h        |  28
-rw-r--r--  include/linux/shmem_fs.h        |  19
-rw-r--r--  include/linux/slab.h            |   5
-rw-r--r--  include/linux/swap.h            |  38
-rw-r--r--  include/linux/uio.h             |   2
-rw-r--r--  include/linux/userfaultfd_k.h   |  92
-rw-r--r--  include/linux/vm_event_item.h   |   6
-rw-r--r--  include/linux/vmalloc.h         |   7
-rw-r--r--  include/linux/vmstat.h          |   6
34 files changed, 626 insertions, 282 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 8f14dca5fed7..1520793c72da 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -196,11 +196,17 @@ void mark_buffer_write_io_error(struct buffer_head *bh);
void touch_buffer(struct buffer_head *bh);
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset);
+void folio_set_bh(struct buffer_head *bh, struct folio *folio,
+ unsigned long offset);
bool try_to_free_buffers(struct folio *);
+struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
+ bool retry);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
bool retry);
void create_empty_buffers(struct page *, unsigned long,
unsigned long b_state);
+void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize,
+ unsigned long b_state);
void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
void end_buffer_async_write(struct buffer_head *bh, int uptodate);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 3410aecffdb4..885f5395fcd0 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -692,7 +692,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
+void cgroup_rstat_flush_atomic(struct cgroup *cgrp);
void cgroup_rstat_flush_hold(struct cgroup *cgrp);
void cgroup_rstat_flush_release(void);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 65a78773dcca..ed8cb537c6a7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -319,7 +319,7 @@ extern void page_frag_free(void *addr);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
-void page_alloc_init(void);
+void page_alloc_init_cpuhp(void);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);
@@ -361,9 +361,4 @@ extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
#endif
void free_contig_range(unsigned long pfn, unsigned long nr_pages);
-#ifdef CONFIG_CMA
-/* CMA stuff */
-extern void init_cma_reserved_pageblock(struct page *page);
-#endif
-
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 5088637fe5c2..6583a58670c5 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -47,16 +47,14 @@ typedef unsigned int __bitwise gfp_t;
#define ___GFP_ACCOUNT 0x400000u
#define ___GFP_ZEROTAGS 0x800000u
#ifdef CONFIG_KASAN_HW_TAGS
-#define ___GFP_SKIP_ZERO 0x1000000u
-#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u
-#define ___GFP_SKIP_KASAN_POISON 0x4000000u
+#define ___GFP_SKIP_ZERO 0x1000000u
+#define ___GFP_SKIP_KASAN 0x2000000u
#else
-#define ___GFP_SKIP_ZERO 0
-#define ___GFP_SKIP_KASAN_UNPOISON 0
-#define ___GFP_SKIP_KASAN_POISON 0
+#define ___GFP_SKIP_ZERO 0
+#define ___GFP_SKIP_KASAN 0
#endif
#ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP 0x8000000u
+#define ___GFP_NOLOCKDEP 0x4000000u
#else
#define ___GFP_NOLOCKDEP 0
#endif
@@ -234,25 +232,24 @@ typedef unsigned int __bitwise gfp_t;
* memory tags at the same time as zeroing memory has minimal additional
* performace impact.
*
- * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation.
- * Only effective in HW_TAGS mode.
- *
- * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation.
- * Typically, used for userspace pages. Only effective in HW_TAGS mode.
+ * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation.
+ * Used for userspace and vmalloc pages; the latter are unpoisoned by
+ * kasan_unpoison_vmalloc instead. For userspace pages, results in
+ * poisoning being skipped as well, see should_skip_kasan_poison for
+ * details. Only effective in HW_TAGS mode.
*/
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
#define __GFP_COMP ((__force gfp_t)___GFP_COMP)
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS)
#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO)
-#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON)
-#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON)
+#define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN)
/* Disable lockdep for GFP context tracking */
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
/**
@@ -335,8 +332,7 @@ typedef unsigned int __bitwise gfp_t;
#define GFP_DMA __GFP_DMA
#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
-#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \
- __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON)
+#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | __GFP_SKIP_KASAN)
#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 8fc10089e19e..4de1dbcd3ef6 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -243,12 +243,10 @@ static inline void clear_highpage(struct page *page)
static inline void clear_highpage_kasan_tagged(struct page *page)
{
- u8 tag;
+ void *kaddr = kmap_local_page(page);
- tag = page_kasan_tag(page);
- page_kasan_tag_reset(page);
- clear_highpage(page);
- page_kasan_tag_set(page, tag);
+ clear_page(kasan_reset_tag(kaddr));
+ kunmap_local(kaddr);
}
#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE
@@ -317,7 +315,29 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
#endif
+#ifndef __HAVE_ARCH_COPY_HIGHPAGE
+
+static inline void copy_highpage(struct page *to, struct page *from)
+{
+ char *vfrom, *vto;
+
+ vfrom = kmap_local_page(from);
+ vto = kmap_local_page(to);
+ copy_page(vto, vfrom);
+ kmsan_copy_page_meta(to, from);
+ kunmap_local(vto);
+ kunmap_local(vfrom);
+}
+
+#endif
+
#ifdef copy_mc_to_kernel
+/*
+ * If architecture supports machine check exception handling, define the
+ * #MC versions of copy_user_highpage and copy_highpage. They copy a memory
+ * page with #MC in source page (@from) handled, and return the number
+ * of bytes not copied if there was a #MC, otherwise 0 for success.
+ */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
unsigned long vaddr, struct vm_area_struct *vma)
{
@@ -334,29 +354,35 @@ static inline int copy_mc_user_highpage(struct page *to, struct page *from,
return ret;
}
-#else
-static inline int copy_mc_user_highpage(struct page *to, struct page *from,
- unsigned long vaddr, struct vm_area_struct *vma)
-{
- copy_user_highpage(to, from, vaddr, vma);
- return 0;
-}
-#endif
-
-#ifndef __HAVE_ARCH_COPY_HIGHPAGE
-static inline void copy_highpage(struct page *to, struct page *from)
+static inline int copy_mc_highpage(struct page *to, struct page *from)
{
+ unsigned long ret;
char *vfrom, *vto;
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
- copy_page(vto, vfrom);
- kmsan_copy_page_meta(to, from);
+ ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
+ if (!ret)
+ kmsan_copy_page_meta(to, from);
kunmap_local(vto);
kunmap_local(vfrom);
+
+ return ret;
+}
+#else
+static inline int copy_mc_user_highpage(struct page *to, struct page *from,
+ unsigned long vaddr, struct vm_area_struct *vma)
+{
+ copy_user_highpage(to, from, vaddr, vma);
+ return 0;
}
+static inline int copy_mc_highpage(struct page *to, struct page *from)
+{
+ copy_highpage(to, from);
+ return 0;
+}
#endif
static inline void memcpy_page(struct page *dst_page, size_t dst_off,
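A brief sketch (not taken from the diff) of how the new copy_mc_highpage() return convention might be consumed: the helper itself comes from the hunk above, while the surrounding recovery path and its name are assumptions.

/*
 * Hypothetical caller: copy a possibly hwpoisoned source page and bail
 * out if a machine check truncated the copy. copy_mc_highpage() returns
 * the number of bytes left uncopied, or 0 on success.
 */
static int copy_or_report_poison(struct page *dst, struct page *src)
{
	if (copy_mc_highpage(dst, src)) {
		memory_failure_queue(page_to_pfn(src), 0);
		return -EHWPOISON;
	}
	return 0;
}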
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 70bd867eba94..20284387b841 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,47 +39,12 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, pgprot_t newprot,
unsigned long cp_flags);
-vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
-/**
- * vmf_insert_pfn_pmd - insert a pmd size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
- bool write)
-{
- return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
-vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
- pgprot_t pgprot, bool write);
-
-/**
- * vmf_insert_pfn_pud - insert a pud size pfn
- * @vmf: Structure describing the fault
- * @pfn: pfn to insert
- * @pgprot: page protection to use
- * @write: whether it's a write fault
- *
- * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
- *
- * Return: vm_fault_t value.
- */
-static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn,
- bool write)
-{
- return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write);
-}
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
enum transparent_hugepage_flag {
- TRANSPARENT_HUGEPAGE_NEVER_DAX,
+ TRANSPARENT_HUGEPAGE_UNSUPPORTED,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4056b05d81ed..6d041aa9f0fe 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,13 +150,12 @@ unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy);
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
@@ -184,6 +183,23 @@ extern struct list_head huge_boot_pages;
/* arch callbacks */
+#ifndef CONFIG_HIGHPTE
+/*
+ * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures
+ * which may go down to the lowest PTE level in their huge_pte_offset() and
+ * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap().
+ */
+static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address)
+{
+ return pte_offset_kernel(pmd, address);
+}
+static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ return pte_alloc(mm, pmd) ? NULL : pte_offset_huge(pmd, address);
+}
+#endif
+
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz);
/*
@@ -385,14 +401,12 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}
#ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop)
{
BUG();
return 0;
@@ -810,7 +824,7 @@ static inline unsigned huge_page_shift(struct hstate *h)
static inline bool hstate_is_gigantic(struct hstate *h)
{
- return huge_page_order(h) >= MAX_ORDER;
+ return huge_page_order(h) > MAX_ORDER;
}
static inline unsigned int pages_per_huge_page(const struct hstate *h)
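To show how the new pte_alloc_huge()/pte_offset_huge() helpers are meant to be used, here is a sketch of an architecture's huge_pte_alloc() that goes down to the lowest PTE level (loosely modelled on contiguous-PTE style huge pages, and assuming the !CONFIG_HIGHPTE case guarded above); the page-table walk around the helper is illustrative, not taken from this diff.

/*
 * Illustrative arch implementation: huge pages at the PMD level return the
 * pmd slot directly, smaller "huge" sizes fall through to the PTE level via
 * pte_alloc_huge().
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, unsigned long sz)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d = p4d_alloc(mm, pgd, addr);
	pud_t *pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
	pmd_t *pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;

	if (!pmd)
		return NULL;
	if (sz >= PMD_SIZE)
		return (pte_t *)pmd;		/* huge page mapped at PMD level */
	return pte_alloc_huge(mm, pmd, addr);	/* lowest-level (cont-PTE) case */
}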
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 09d4f17c8d3b..7376c1df9c90 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,7 +69,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping,
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
}
@@ -79,7 +82,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
kunmap_local_indexed((void __force *)vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
@@ -162,7 +168,10 @@ static inline void __iomem *
io_mapping_map_atomic_wc(struct io_mapping *mapping,
unsigned long offset)
{
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+ else
+ migrate_disable();
pagefault_disable();
return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
}
@@ -172,7 +181,10 @@ io_mapping_unmap_atomic(void __iomem *vaddr)
{
io_mapping_unmap(vaddr);
pagefault_enable();
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+ else
+ migrate_enable();
}
static inline void __iomem *
diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h
index 30b17647ce3c..e0c23a32cdf0 100644
--- a/include/linux/kmsan.h
+++ b/include/linux/kmsan.h
@@ -54,7 +54,8 @@ void __init kmsan_init_runtime(void);
* Freed pages are either returned to buddy allocator or held back to be used
* as metadata pages.
*/
-bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order);
+bool __init __must_check kmsan_memblock_free_pages(struct page *page,
+ unsigned int order);
/**
* kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call.
@@ -137,9 +138,11 @@ void kmsan_kfree_large(const void *ptr);
* vmalloc metadata address range. Returns 0 on success, callers must check
* for non-zero return value.
*/
-int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages,
- unsigned int page_shift);
+int __must_check kmsan_vmap_pages_range_noflush(unsigned long start,
+ unsigned long end,
+ pgprot_t prot,
+ struct page **pages,
+ unsigned int page_shift);
/**
* kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@@ -163,9 +166,9 @@ void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end);
* virtual memory. Returns 0 on success, callers must check for non-zero return
* value.
*/
-int kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int page_shift);
+int __must_check kmsan_ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift);
/**
* kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call.
@@ -237,8 +240,8 @@ static inline void kmsan_init_runtime(void)
{
}
-static inline bool kmsan_memblock_free_pages(struct page *page,
- unsigned int order)
+static inline bool __must_check kmsan_memblock_free_pages(struct page *page,
+ unsigned int order)
{
return true;
}
@@ -251,10 +254,9 @@ static inline void kmsan_task_exit(struct task_struct *task)
{
}
-static inline int kmsan_alloc_page(struct page *page, unsigned int order,
- gfp_t flags)
+static inline void kmsan_alloc_page(struct page *page, unsigned int order,
+ gfp_t flags)
{
- return 0;
}
static inline void kmsan_free_page(struct page *page, unsigned int order)
@@ -283,11 +285,9 @@ static inline void kmsan_kfree_large(const void *ptr)
{
}
-static inline int kmsan_vmap_pages_range_noflush(unsigned long start,
- unsigned long end,
- pgprot_t prot,
- struct page **pages,
- unsigned int page_shift)
+static inline int __must_check kmsan_vmap_pages_range_noflush(
+ unsigned long start, unsigned long end, pgprot_t prot,
+ struct page **pages, unsigned int page_shift)
{
return 0;
}
@@ -297,10 +297,11 @@ static inline void kmsan_vunmap_range_noflush(unsigned long start,
{
}
-static inline int kmsan_ioremap_page_range(unsigned long start,
- unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int page_shift)
+static inline int __must_check kmsan_ioremap_page_range(unsigned long start,
+ unsigned long end,
+ phys_addr_t phys_addr,
+ pgprot_t prot,
+ unsigned int page_shift)
{
return 0;
}
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..7a9b76fb6c3f 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,13 +18,26 @@
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
+
+void ksm_add_vma(struct vm_area_struct *vma);
+int ksm_enable_merge_any(struct mm_struct *mm);
+
int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
- return __ksm_enter(mm);
+ int ret;
+
+ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) {
+ ret = __ksm_enter(mm);
+ if (ret)
+ return ret;
+ }
+
+ if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
+ set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+
return 0;
}
@@ -51,8 +64,21 @@ struct page *ksm_might_need_to_copy(struct page *page,
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
+#ifdef CONFIG_MEMORY_FAILURE
+void collect_procs_ksm(struct page *page, struct list_head *to_kill,
+ int force_early);
+#endif
+
+#ifdef CONFIG_PROC_FS
+long ksm_process_profit(struct mm_struct *);
+#endif /* CONFIG_PROC_FS */
+
#else /* !CONFIG_KSM */
+static inline void ksm_add_vma(struct vm_area_struct *vma)
+{
+}
+
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
return 0;
@@ -62,6 +88,13 @@ static inline void ksm_exit(struct mm_struct *mm)
{
}
+#ifdef CONFIG_MEMORY_FAILURE
+static inline void collect_procs_ksm(struct page *page,
+ struct list_head *to_kill, int force_early)
+{
+}
+#endif
+
#ifdef CONFIG_MMU
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 50ad19662a32..f82ee3fac1cd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -597,6 +597,8 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */
#endif
#ifdef CONFIG_MEMTEST
+extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */
+extern bool early_memtest_done; /* Was early memtest done? */
extern void early_memtest(phys_addr_t start, phys_addr_t end);
#else
static inline void early_memtest(phys_addr_t start, phys_addr_t end)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6eda2ab205d..222d7370134c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -97,6 +97,7 @@ struct shrinker_info {
struct rcu_head rcu;
atomic_long_t *nr_deferred;
unsigned long *map;
+ int map_nr_max;
};
struct lruvec_stats_percpu {
@@ -1037,7 +1038,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
}
void mem_cgroup_flush_stats(void);
-void mem_cgroup_flush_stats_delayed(void);
+void mem_cgroup_flush_stats_atomic(void);
+void mem_cgroup_flush_stats_ratelimited(void);
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
@@ -1535,7 +1537,11 @@ static inline void mem_cgroup_flush_stats(void)
{
}
-static inline void mem_cgroup_flush_stats_delayed(void)
+static inline void mem_cgroup_flush_stats_atomic(void)
+{
+}
+
+static inline void mem_cgroup_flush_stats_ratelimited(void)
{
}
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 4f1600413f91..e7abf6fa4c52 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -5,9 +5,9 @@
#include <linux/file.h>
#ifdef CONFIG_MEMFD_CREATE
-extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg);
#else
-static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
+static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
{
return -EINVAL;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 98da268b834a..3731999cd9f0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,6 +29,7 @@
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
+#include <linux/slab.h>
struct mempolicy;
struct anon_vma;
@@ -38,6 +39,7 @@ struct pt_regs;
extern int sysctl_page_lock_unfairness;
+void mm_core_init(void);
void init_mm_internals(void);
#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */
@@ -256,6 +258,8 @@ void setup_initial_init_mm(void *start_code, void *end_code,
struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);
+/* Use only if VMA has no other users */
+void __vm_area_free(struct vm_area_struct *vma);
#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
@@ -478,7 +482,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
{ FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \
+ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" }
/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@@ -623,6 +628,131 @@ struct vm_operations_struct {
unsigned long addr);
};
+#ifdef CONFIG_NUMA_BALANCING
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+ vma->numab_state = NULL;
+}
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+ kfree(vma->numab_state);
+}
+#else
+static inline void vma_numab_state_init(struct vm_area_struct *vma) {}
+static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ */
+static inline bool vma_start_read(struct vm_area_struct *vma)
+{
+ /* Check before locking. A race might cause false locked result. */
+ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
+ return false;
+
+ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+ return false;
+
+ /*
+ * Overflow might produce false locked result.
+ * False unlocked result is impossible because we modify and check
+ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
+ * modification invalidates all existing locks.
+ */
+ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
+ up_read(&vma->vm_lock->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void vma_end_read(struct vm_area_struct *vma)
+{
+ rcu_read_lock(); /* keeps vma alive till the end of up_read */
+ up_read(&vma->vm_lock->lock);
+ rcu_read_unlock();
+}
+
+static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ return (vma->vm_lock_seq == *mm_lock_seq);
+}
+
+static inline void vma_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return;
+
+ down_write(&vma->vm_lock->lock);
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+}
+
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ if (__is_vma_write_locked(vma, &mm_lock_seq))
+ return true;
+
+ if (!down_write_trylock(&vma->vm_lock->lock))
+ return false;
+
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->vm_lock->lock);
+ return true;
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
+}
+
+static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
+{
+ /* When detaching vma should be write-locked */
+ if (detached)
+ vma_assert_write_locked(vma);
+ vma->detached = detached;
+}
+
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address);
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline bool vma_start_read(struct vm_area_struct *vma)
+ { return false; }
+static inline void vma_end_read(struct vm_area_struct *vma) {}
+static inline void vma_start_write(struct vm_area_struct *vma) {}
+static inline bool vma_try_start_write(struct vm_area_struct *vma)
+ { return true; }
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+static inline void vma_mark_detached(struct vm_area_struct *vma,
+ bool detached) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+/*
+ * WARNING: vma_init does not initialize vma->vm_lock.
+ * Use vm_area_alloc()/vm_area_free() if vma needs locking.
+ */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
@@ -631,6 +761,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma_mark_detached(vma, false);
+ vma_numab_state_init(vma);
}
/* Use when VMA is not part of the VMA tree and needs no locking */
@@ -644,28 +776,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
vm_flags_init(vma, flags);
}
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) |= flags;
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
}
@@ -686,7 +818,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma,
static inline void vm_flags_mod(struct vm_area_struct *vma,
vm_flags_t set, vm_flags_t clear)
{
- mmap_assert_write_locked(vma->vm_mm);
+ vma_start_write(vma);
__vm_flags_mod(vma, set, clear);
}
@@ -1554,6 +1686,16 @@ static inline int xchg_page_access_time(struct page *page, int time)
last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS);
return last_time << PAGE_ACCESS_TIME_BUCKETS;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+ unsigned int pid_bit;
+
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+ if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) {
+ __set_bit(pid_bit, &vma->numab_state->access_pids[1]);
+ }
+}
#else /* !CONFIG_NUMA_BALANCING */
static inline int page_cpupid_xchg_last(struct page *page, int cpupid)
{
@@ -1603,6 +1745,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
return false;
}
+
+static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -2636,12 +2782,6 @@ static inline bool ptlock_init(struct page *page) { return true; }
static inline void ptlock_free(struct page *page) {}
#endif /* USE_SPLIT_PTE_PTLOCKS */
-static inline void pgtable_init(void)
-{
- ptlock_cache_init();
- pgtable_cache_init();
-}
-
static inline bool pgtable_pte_page_ctor(struct page *page)
{
if (!ptlock_init(page))
@@ -2785,7 +2925,6 @@ extern unsigned long free_reserved_area(void *start, void *end,
int poison, const char *s);
extern void adjust_managed_page_count(struct page *page, long count);
-extern void mem_init_print_info(void);
extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
@@ -2896,7 +3035,6 @@ extern void setup_per_cpu_pageset(void);
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
-extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
@@ -3185,8 +3323,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
-vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
@@ -3256,7 +3392,6 @@ extern int apply_to_existing_page_range(struct mm_struct *mm,
unsigned long address, unsigned long size,
pte_fn_t fn, void *data);
-extern void __init init_mem_debugging_and_hardening(void);
#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
@@ -3425,6 +3560,22 @@ void vmemmap_populate_print_last(void);
void vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap);
#endif
+
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
+{
+ return is_power_of_2(sizeof(struct page)) &&
+ pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+}
+#else
+static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
+{
+ return false;
+}
+#endif
+
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
unsigned long nr_pages);
@@ -3451,6 +3602,7 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
+struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
#else
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
@@ -3471,6 +3623,12 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
}
#endif
+#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM)
+void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
+ struct vm_area_struct *vma, struct list_head *to_kill,
+ unsigned long ksm_addr);
+#endif
+
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
@@ -3540,14 +3698,12 @@ extern const struct attribute_group memory_failure_attr_group;
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
unsigned int pages_per_huge_page);
-extern void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr_hint,
- struct vm_area_struct *vma,
- unsigned int pages_per_huge_page);
-extern long copy_huge_page_from_user(struct page *dst_page,
- const void __user *usr_src,
- unsigned int pages_per_huge_page,
- bool allow_pagefault);
+int copy_user_large_folio(struct folio *dst, struct folio *src,
+ unsigned long addr_hint,
+ struct vm_area_struct *vma);
+long copy_folio_from_user(struct folio *dst_folio,
+ const void __user *usr_src,
+ bool allow_pagefault);
/**
* vma_is_special_huge - Are transhuge page-table entries considered special?
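The per-VMA lock helpers declared above are consumed by arch page-fault handlers. A rough sketch of that pattern follows, assuming a generic handler shape; the function name and fallback policy are illustrative, only lock_vma_under_rcu(), vma_end_read() and FAULT_FLAG_VMA_LOCK come from this series.

/*
 * Rough shape of a per-VMA-locked fault attempt: if the VMA cannot be
 * read-locked, the caller falls back to the mmap_lock path. On retry or
 * completion the VMA lock has already been dropped for us.
 */
static vm_fault_t try_vma_locked_fault(struct mm_struct *mm,
				       unsigned long address,
				       unsigned int flags,
				       struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		return VM_FAULT_RETRY;	/* caller takes mmap_lock instead */

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	return fault;
}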
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index de1e622dd366..0e1d239a882c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
/* The current status of the pte should be "cleared" before calling */
WARN_ON_ONCE(!pte_none(*pte));
+ /*
+ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
+ * thing, because when zapping either it means it's dropping the
+ * page, or in TTU where the present pte will be quickly replaced
+ * with a swap pte. There's no way of leaking the bit.
+ */
if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
return;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a57e6ae78e65..3fc9e680f174 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -471,6 +471,16 @@ struct anon_vma_name {
char name[];
};
+struct vma_lock {
+ struct rw_semaphore lock;
+};
+
+struct vma_numab_state {
+ unsigned long next_scan;
+ unsigned long next_pid_reset;
+ unsigned long access_pids[2];
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -480,17 +490,19 @@ struct anon_vma_name {
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
- unsigned long vm_start; /* Our start address within vm_mm. */
- unsigned long vm_end; /* The first byte after our end address
- within vm_mm. */
+ union {
+ struct {
+ /* VMA covers [vm_start; vm_end) addresses within mm */
+ unsigned long vm_start;
+ unsigned long vm_end;
+ };
+#ifdef CONFIG_PER_VMA_LOCK
+ struct rcu_head vm_rcu; /* Used for deferred freeing. */
+#endif
+ };
struct mm_struct *vm_mm; /* The address space we belong to. */
-
- /*
- * Access permissions of this VMA.
- * See vmf_insert_mixed_prot() for discussion.
- */
- pgprot_t vm_page_prot;
+ pgprot_t vm_page_prot; /* Access permissions of this VMA. */
/*
* Flags, see mm.h.
@@ -501,6 +513,14 @@ struct vm_area_struct {
vm_flags_t __private __vm_flags;
};
+#ifdef CONFIG_PER_VMA_LOCK
+ int vm_lock_seq;
+ struct vma_lock *vm_lock;
+
+ /* Flag to indicate areas detached from the mm->mm_mt tree */
+ bool detached;
+#endif
+
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
@@ -547,6 +567,9 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ struct vma_numab_state *numab_state; /* NUMA Balancing state */
+#endif
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
@@ -637,6 +660,9 @@ struct mm_struct {
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
+#ifdef CONFIG_PER_VMA_LOCK
+ int mm_lock_seq;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
@@ -1037,6 +1063,7 @@ typedef struct {
* mapped after the fault.
* @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
* We should only access orig_pte if this flag set.
+ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
@@ -1074,6 +1101,7 @@ enum fault_flag {
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
+ FAULT_FLAG_VMA_LOCK = 1 << 12,
};
typedef unsigned int __bitwise zap_flags_t;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 96e113e23d04..aab8f1b28d26 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
#endif /* CONFIG_TRACING */
+static inline void mmap_assert_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+static inline void mmap_assert_write_locked(struct mm_struct *mm)
+{
+ lockdep_assert_held_write(&mm->mmap_lock);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_end_write_all(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ /* No races during update due to exclusive mmap_lock being held */
+ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_end_write_all(struct mm_struct *mm) {}
+#endif
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
+ vma_end_write_all(mm);
up_write(&mm->mmap_lock);
}
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
__mmap_lock_trace_acquire_returned(mm, false, true);
+ vma_end_write_all(mm);
downgrade_write(&mm->mmap_lock);
}
@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
up_read_non_owner(&mm->mmap_lock);
}
-static inline void mmap_assert_locked(struct mm_struct *mm)
-{
- lockdep_assert_held(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
-{
- lockdep_assert_held_write(&mm->mmap_lock);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
-}
-
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
return rwsem_is_contended(&mm->mmap_lock);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..a4889c9d4055 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -26,11 +26,13 @@
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
-#define MAX_ORDER 11
+#define MAX_ORDER 10
#else
#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
-#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
+#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER)
+
+#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
/*
* PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
@@ -93,7 +95,7 @@ static inline bool migratetype_is_mergeable(int mt)
}
#define for_each_migratetype_order(order, type) \
- for (order = 0; order < MAX_ORDER; order++) \
+ for (order = 0; order <= MAX_ORDER; order++) \
for (type = 0; type < MIGRATE_TYPES; type++)
extern int page_group_by_mobility_disabled;
@@ -108,18 +110,6 @@ struct free_area {
unsigned long nr_free;
};
-static inline struct page *get_page_from_free_area(struct free_area *area,
- int migratetype)
-{
- return list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
-}
-
-static inline bool free_area_empty(struct free_area *area, int migratetype)
-{
- return list_empty(&area->free_list[migratetype]);
-}
-
struct pglist_data;
#ifdef CONFIG_NUMA
@@ -453,18 +443,14 @@ enum {
struct lru_gen_mm_state {
/* set to max_seq after each iteration */
unsigned long seq;
- /* where the current iteration continues (inclusive) */
+ /* where the current iteration continues after */
struct list_head *head;
- /* where the last iteration ended (exclusive) */
+ /* where the last iteration ended before */
struct list_head *tail;
- /* to wait for the last page table walker to finish */
- struct wait_queue_head wait;
/* Bloom filters flip after each iteration */
unsigned long *filters[NR_BLOOM_FILTERS];
/* the mm stats for debugging */
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
- /* the number of concurrent page table walkers */
- int nr_walkers;
};
struct lru_gen_mm_walk {
@@ -922,7 +908,7 @@ struct zone {
CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */
- struct free_area free_area[MAX_ORDER];
+ struct free_area free_area[MAX_ORDER + 1];
/* zone flags, see below */
unsigned long flags;
@@ -1369,7 +1355,7 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
- struct lru_gen_mm_walk mm_walk;
+ struct lru_gen_mm_walk mm_walk;
/* lru_gen_folio list */
struct lru_gen_memcg memcg_lru;
#endif
@@ -1745,7 +1731,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes)
#define SECTION_BLOCKFLAGS_BITS \
((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)
-#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_ORDER exceeds SECTION_SIZE
#endif
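The MAX_ORDER redefinition above makes the limit inclusive. A tiny illustration of the resulting caller-side idiom (the functions are hypothetical, the conventions come straight from the hunk):

/*
 * With MAX_ORDER now inclusive, MAX_ORDER itself is a valid allocation
 * order: loops run "order <= MAX_ORDER" and per-order arrays are sized
 * MAX_ORDER + 1.
 */
static bool order_is_valid(unsigned int order)
{
	return order <= MAX_ORDER;	/* previously: order < MAX_ORDER */
}

static unsigned long nr_free_total(struct zone *zone)
{
	unsigned long total = 0;
	unsigned int order;

	for (order = 0; order <= MAX_ORDER; order++)
		total += zone->free_area[order].nr_free;
	return total;
}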
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b89dd027d48..1c68d67b832f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -136,9 +136,6 @@ enum pageflags {
PG_arch_2,
PG_arch_3,
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
- PG_skip_kasan_poison,
-#endif
__NR_PAGEFLAGS,
PG_readahead = PG_reclaim,
@@ -590,12 +587,6 @@ TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
-#ifdef CONFIG_KASAN_HW_TAGS
-PAGEFLAG(SkipKASanPoison, skip_kasan_poison, PF_HEAD)
-#else
-PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison)
-#endif
-
/*
* PageReported() is used to track reported free pages within the Buddy
* allocator. We can use the non-atomic version of the test and set
@@ -815,14 +806,9 @@ static inline void ClearPageCompound(struct page *page)
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(struct page *page);
-int PageHeadHuge(struct page *page);
-static inline bool folio_test_hugetlb(struct folio *folio)
-{
- return PageHeadHuge(&folio->page);
-}
+bool folio_test_hugetlb(struct folio *folio);
#else
TESTPAGEFLAG_FALSE(Huge, hugetlb)
-TESTPAGEFLAG_FALSE(HeadHuge, headhuge)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -917,9 +903,14 @@ static inline bool is_page_hwpoison(struct page *page)
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
+static inline int page_type_has_type(unsigned int page_type)
+{
+ return (int)page_type < PAGE_MAPCOUNT_RESERVE;
+}
+
static inline int page_has_type(struct page *page)
{
- return (int)page->page_type < PAGE_MAPCOUNT_RESERVE;
+ return page_type_has_type(page->page_type);
}
#define PAGE_TYPE_OPS(uname, lname) \
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index bc2e39090a1f..67314f648aeb 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -29,8 +29,6 @@ struct page_ext_operations {
bool need_shared_flags;
};
-extern bool deferred_struct_pages;
-
#ifdef CONFIG_PAGE_EXTENSION
/*
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 5f1ae07d724b..e83c4c095041 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,14 +41,14 @@ extern unsigned int pageblock_order;
* Huge pages are a constant size, but don't exceed the maximum allocation
* granularity.
*/
-#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1)
+#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
#else /* CONFIG_HUGETLB_PAGE */
/* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
-#define pageblock_order (MAX_ORDER-1)
+#define pageblock_order MAX_ORDER
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c4698dcc70ba..a56308a9d1a4 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -504,11 +504,11 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOFS 0x00000010
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
-#define FGP_ENTRY 0x00000080
-#define FGP_STABLE 0x00000100
+#define FGP_STABLE 0x00000080
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
int fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
@@ -522,7 +522,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
* Looks up the page cache entry at @mapping & @index. If a folio is
* present, it is returned with an increased refcount.
*
- * Otherwise, %NULL is returned.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index. Will not return a shadow, swap or DAX entry.
*/
static inline struct folio *filemap_get_folio(struct address_space *mapping,
pgoff_t index)
@@ -539,8 +540,8 @@ static inline struct folio *filemap_get_folio(struct address_space *mapping,
* present, it is returned locked with an increased refcount.
*
* Context: May sleep.
- * Return: A folio or %NULL if there is no folio in the cache for this
- * index. Will not return a shadow, swap or DAX entry.
+ * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
+ * this index. Will not return a shadow, swap or DAX entry.
*/
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
pgoff_t index)
@@ -557,8 +558,8 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping,
* a new folio is created. The folio is locked, marked as accessed, and
* returned.
*
- * Return: A found or created folio. NULL if no folio is found and failed to
- * create a folio.
+ * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
+ * and failed to create a folio.
*/
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
pgoff_t index)
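Per the updated kerneldoc above, __filemap_get_folio() and its wrappers now return ERR_PTR() values instead of NULL, so callers switch from NULL checks to IS_ERR(). A sketch of an updated caller, with the surrounding function being an assumption:

/*
 * Hypothetical caller updated for the new return convention.
 */
static int touch_index(struct address_space *mapping, pgoff_t index)
{
	struct folio *folio = filemap_lock_folio(mapping, index);

	if (IS_ERR(folio))		/* was: if (!folio) */
		return PTR_ERR(folio);	/* -ENOENT when nothing is cached */

	folio_mark_dirty(folio);
	folio_unlock(folio);
	folio_put(folio);
	return 0;
}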
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c63cd44777ec..c5a51481bbb9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -817,7 +817,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
#endif
#ifndef flush_tlb_fix_spurious_fault
-#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif
/*
@@ -1191,9 +1191,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma,
}
/*
- * untrack_pfn_moved is called while mremapping a pfnmap for a new region.
+ * untrack_pfn_clear is called while mremapping a pfnmap for a new region
+ * or fails to copy pgtable during duplicate vm area.
*/
-static inline void untrack_pfn_moved(struct vm_area_struct *vma)
+static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
}
#else
@@ -1205,7 +1206,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
extern int track_pfn_copy(struct vm_area_struct *vma);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked);
-extern void untrack_pfn_moved(struct vm_area_struct *vma);
+extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif
#ifdef CONFIG_MMU
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0e17ae7fbfd3..0ee96ea7a0e9 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+#define MMF_VM_MERGE_ANY 29
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 8c15abd67aed..fe1a46f30d24 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -46,6 +46,12 @@ static inline bool housekeeping_enabled(enum hk_type type)
static inline void housekeeping_affine(struct task_struct *t,
enum hk_type type) { }
+
+static inline bool housekeeping_test_cpu(int cpu, enum hk_type type)
+{
+ return true;
+}
+
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
@@ -58,4 +64,10 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type)
return true;
}
+static inline bool cpu_is_isolated(int cpu)
+{
+ return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) ||
+ !housekeeping_test_cpu(cpu, HK_TYPE_TICK);
+}
+
#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2a243616f222..689dbe812563 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -79,6 +79,34 @@ static inline void mmdrop_sched(struct mm_struct *mm)
}
#endif
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+ mmdrop(mm);
+ } else {
+ /*
+ * mmdrop_lazy_tlb must provide a full memory barrier, see the
+ * membarrier comment finish_task_switch which relies on this.
+ */
+ smp_mb();
+ }
+}
+
+static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
+{
+ if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+ mmdrop_sched(mm);
+ else
+ smp_mb(); /* see mmdrop_lazy_tlb() above */
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 103d1000a5a2..9029abd29b1c 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -45,6 +46,7 @@ struct shmem_sb_info {
kuid_t uid; /* Mount uid for root directory */
kgid_t gid; /* Mount gid for root directory */
bool full_inums; /* If i_ino should be uint or ino_t */
+ bool noswap; /* ignores VM reclaim / swap requests */
ino_t next_ino; /* The next per-sb inode number to use */
ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */
struct mempolicy *mpol; /* default memory policy for mappings */
@@ -94,7 +96,14 @@ int shmem_unuse(unsigned int type);
extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
struct mm_struct *mm, unsigned long vm_flags);
+#ifdef CONFIG_SHMEM
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
+#else
+static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+ return 0;
+}
+#endif
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -151,15 +160,15 @@ extern void shmem_uncharge(struct inode *inode, long pages);
#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
- struct page **pagep);
+ uffd_flags_t flags,
+ struct folio **foliop);
#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
- src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
+#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
+ src_addr, flags, foliop) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7db48f9f0d9d..6b3e155b70bf 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -167,7 +167,6 @@ struct mem_cgroup;
/*
* struct kmem_cache related prototypes
*/
-void __init kmem_cache_init(void);
bool slab_is_available(void);
struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
@@ -284,7 +283,7 @@ static inline unsigned int arch_slab_minalign(void)
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 5
#endif
@@ -292,7 +291,7 @@ static inline unsigned int arch_slab_minalign(void)
#ifdef CONFIG_SLUB
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 3
#endif
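Both KMALLOC_SHIFT_MAX hunks are mechanical fallout of MAX_ORDER becoming inclusive: the largest page-allocator block is now 2^MAX_ORDER pages rather than 2^(MAX_ORDER - 1), so dropping the "- 1" keeps the largest kmalloc size unchanged. As a worked example under assumed values (PAGE_SHIFT = 12, old MAX_ORDER = 11, new MAX_ORDER = 10), both the old and new formulas give KMALLOC_SHIFT_MAX = 22, i.e. a 4 MiB top-level kmalloc cache. A tiny sanity check under those same assumptions:

/*
 * Assumes PAGE_SHIFT == 12 and MAX_ORDER == 10 (common x86-64 defaults);
 * other configurations yield a different, but equally consistent, value.
 */
#include <linux/build_bug.h>
#include <linux/slab.h>

static_assert(KMALLOC_SHIFT_MAX == MAX_ORDER + PAGE_SHIFT);
static_assert((1UL << (12 + 10)) == 4UL * 1024 * 1024);	/* 4 MiB */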
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 209a425739a9..3c69cb653cb9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -153,13 +153,28 @@ union swap_header {
* memory reclaim
*/
struct reclaim_state {
- unsigned long reclaimed_slab;
+ /* pages reclaimed outside of LRU-based reclaim */
+ unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
};
+/*
+ * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
+ * reclaim
+ * @pages: number of pages reclaimed
+ *
+ * If the current process is undergoing a reclaim operation, increment the
+ * number of reclaimed pages by @pages.
+ */
+static inline void mm_account_reclaimed_pages(unsigned long pages)
+{
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed += pages;
+}
+
#ifdef __KERNEL__
struct address_space;
@@ -427,7 +442,6 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);
-extern unsigned long reclaim_pages(struct list_head *page_list);
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
@@ -620,18 +634,18 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
/* Cgroup2 doesn't have per-cgroup swappiness */
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
/* root ? */
if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
- return memcg->swappiness;
+ return READ_ONCE(memcg->swappiness);
}
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
- return vm_swappiness;
+ return READ_ONCE(vm_swappiness);
}
#endif
@@ -641,22 +655,18 @@ extern atomic_t zswap_stored_pages;
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
+static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
if (mem_cgroup_disabled())
return;
- __cgroup_throttle_swaprate(page, gfp_mask);
+ __folio_throttle_swaprate(folio, gfp);
}
#else
-static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
-{
-}
-#endif
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
- cgroup_throttle_swaprate(&folio->page, gfp);
}
+#endif
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry);
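mm_account_reclaimed_pages() above replaces open-coded updates of the old reclaimed_slab counter: a non-LRU reclaimer just reports how many whole pages it handed back to the page allocator, and the call is a no-op outside of reclaim context. A hedged sketch of a shrinker scan callback using it; my_trim_cache() is invented for illustration and returns object and page counts.

/* Illustrative shrinker: my_trim_cache() is a hypothetical helper. */
#include <linux/shrinker.h>
#include <linux/swap.h>

unsigned long my_trim_cache(unsigned long nr_to_scan, unsigned long *pages_freed);

static unsigned long my_shrink_scan(struct shrinker *s, struct shrink_control *sc)
{
	unsigned long pages_freed;
	unsigned long objects_freed = my_trim_cache(sc->nr_to_scan, &pages_freed);

	/* Credit whole pages returned to the allocator to this reclaim pass. */
	mm_account_reclaimed_pages(pages_freed);

	return objects_freed;
}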
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ed35f4427a0a..3d386849a758 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -195,6 +195,8 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
{
return copy_page_to_iter(&folio->page, offset, bytes, i);
}
+size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
+ size_t bytes, struct iov_iter *i);
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fff49fec0258..d78b01524349 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -38,40 +38,55 @@
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
-/*
- * The mode of operation for __mcopy_atomic and its helpers.
- *
- * This is almost an implementation detail (mcopy_atomic below doesn't take this
- * as a parameter), but it's exposed here because memory-kind-specific
- * implementations (e.g. hugetlbfs) need to know the mode of operation.
- */
-enum mcopy_atomic_mode {
- /* A normal copy_from_user into the destination range. */
- MCOPY_ATOMIC_NORMAL,
- /* Don't copy; map the destination range to the zero page. */
- MCOPY_ATOMIC_ZEROPAGE,
- /* Just install pte(s) with the existing page(s) in the page cache. */
- MCOPY_ATOMIC_CONTINUE,
+/* A combined operation mode + behavior flags. */
+typedef unsigned int __bitwise uffd_flags_t;
+
+/* Mutually exclusive modes of operation. */
+enum mfill_atomic_mode {
+ MFILL_ATOMIC_COPY,
+ MFILL_ATOMIC_ZEROPAGE,
+ MFILL_ATOMIC_CONTINUE,
+ NR_MFILL_ATOMIC_MODES,
};
-extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
+#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
+#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
+#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
+
+static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
+{
+ return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+}
+
+static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
+{
+ flags &= ~MFILL_ATOMIC_MODE_MASK;
+ return flags | ((__force uffd_flags_t) mode);
+}
+
+/* Flags controlling behavior. These behavior changes are mode-independent. */
+#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)
+
+extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy);
-
-extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode);
-extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing);
+ bool newly_allocated, uffd_flags_t flags);
+
+extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len,
+ atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
-extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
/* mm helpers */
@@ -177,6 +192,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
+extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
#else /* CONFIG_USERFAULTFD */
@@ -272,8 +288,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+ /* Only wr-protect mode uses pte markers */
+ if (!userfaultfd_wp(vma))
+ return false;
+
+ /* File-based uffd-wp always needs markers */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /*
+ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
+ * enabled (to apply markers on zero pages).
+ */
+ return userfaultfd_wp_unpopulated(vma);
+}
+
static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
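The uffd_flags_t rework above packs one of the mutually exclusive MFILL_ATOMIC_* modes into the low bits and keeps mode-independent behaviour flags (currently just MFILL_ATOMIC_WP) above them, replacing the old enum-plus-booleans calling convention. A minimal sketch of composing and inspecting such a value using only the helpers defined here:

/* Sketch of building/testing uffd_flags_t; not lifted from the ioctl paths. */
#include <linux/userfaultfd_k.h>

static uffd_flags_t make_wp_copy_flags(void)
{
	uffd_flags_t flags = 0;

	flags = uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY);	/* pick the mode */
	flags |= MFILL_ATOMIC_WP;	/* request write-protected PTEs */
	return flags;
}

static bool is_zeropage_request(uffd_flags_t flags)
{
	/* Mode bits compare exactly; behaviour flags are masked off. */
	return uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE);
}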
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 7f5d1caf5890..8abfa1240040 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -150,6 +150,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
DIRECT_MAP_LEVEL2_SPLIT,
DIRECT_MAP_LEVEL3_SPLIT,
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+ VMA_LOCK_SUCCESS,
+ VMA_LOCK_ABORT,
+ VMA_LOCK_RETRY,
+ VMA_LOCK_MISS,
+#endif
NR_VM_EVENT_ITEMS
};
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 69250efa03d1..c720be70c8dd 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -14,6 +14,7 @@
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
struct notifier_block; /* in notifier.h */
+struct iov_iter; /* in uio.h */
/* bits in flags of vmalloc's vm_struct below */
#define VM_IOREMAP 0x00000001 /* ioremap() and friends */
@@ -131,12 +132,8 @@ extern void *vm_map_ram(struct page **pages, unsigned int count, int node);
extern void vm_unmap_aliases(void);
#ifdef CONFIG_MMU
-extern void __init vmalloc_init(void);
extern unsigned long vmalloc_nr_pages(void);
#else
-static inline void vmalloc_init(void)
-{
-}
static inline unsigned long vmalloc_nr_pages(void) { return 0; }
#endif
@@ -251,7 +248,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
#endif
/* for /proc/kcore */
-extern long vread(char *buf, char *addr, unsigned long count);
+extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count);
/*
* Internals. Don't use..
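vread() becoming vread_iter() means vmalloc contents are copied directly into a caller-supplied iov_iter rather than into a kernel bounce buffer first. A hedged sketch of a /proc/kcore-style caller (error handling trimmed; @vaddr is assumed to already be a valid vmalloc address):

/* Hedged sketch: copy @count bytes of vmalloc memory at @vaddr into @iter. */
#include <linux/uio.h>
#include <linux/vmalloc.h>

static ssize_t copy_vmalloc_range(struct iov_iter *iter, const char *vaddr,
				  size_t count)
{
	long copied = vread_iter(iter, vaddr, count);

	/* A short return means the iterator could not absorb everything. */
	return copied;
}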
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 19cf5b6892ce..fed855bae6d8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif
+#ifdef CONFIG_PER_VMA_LOCK_STATS
+#define count_vm_vma_lock_event(x) count_vm_event(x)
+#else
+#define count_vm_vma_lock_event(x) do {} while (0)
+#endif
+
#define __count_zid_vm_events(item, zid, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
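Because count_vm_vma_lock_event() compiles to a no-op that never evaluates its argument when CONFIG_PER_VMA_LOCK_STATS is off, fault paths can name the VMA_LOCK_* events unconditionally without moving them out from behind the config option. A small hedged sketch of the intended pattern; try_lock_this_vma() is an invented stand-in for the real per-VMA lock attempt.

/* Illustrative only: try_lock_this_vma() is a hypothetical helper. */
#include <linux/mm.h>
#include <linux/vmstat.h>

bool try_lock_this_vma(struct vm_area_struct *vma);	/* hypothetical */

static bool fault_try_vma_lock(struct vm_area_struct *vma)
{
	if (try_lock_this_vma(vma)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		return true;
	}

	/* Record the miss (a no-op unless CONFIG_PER_VMA_LOCK_STATS=y). */
	count_vm_vma_lock_event(VMA_LOCK_MISS);
	return false;
}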