summaryrefslogtreecommitdiff
path: root/include/linux/mm.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--include/linux/mm.h729
1 files changed, 521 insertions, 208 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d16b33bacc32..7a1819c20643 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly;
# endif
#endif
+#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
+
#include <asm/page.h>
#include <asm/processor.h>
@@ -273,178 +275,235 @@ extern unsigned int kobjsize(const void *objp);
* vm_flags in vm_area_struct, see mm_types.h.
* When changing, update also include/trace/events/mmflags.h
*/
-#define VM_NONE 0x00000000
-#define VM_READ 0x00000001 /* currently active flags */
-#define VM_WRITE 0x00000002
-#define VM_EXEC 0x00000004
-#define VM_SHARED 0x00000008
+#define VM_NONE 0x00000000
-/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
-#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */
-#define VM_MAYWRITE 0x00000020
-#define VM_MAYEXEC 0x00000040
-#define VM_MAYSHARE 0x00000080
+/**
+ * typedef vma_flag_t - specifies an individual VMA flag by bit number.
+ *
+ * This value is made type safe by sparse to avoid passing invalid flag values
+ * around.
+ */
+typedef int __bitwise vma_flag_t;
-#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define DECLARE_VMA_BIT(name, bitnum) \
+ VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum)
+#define DECLARE_VMA_BIT_ALIAS(name, aliased) \
+ VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT)
+enum {
+ DECLARE_VMA_BIT(READ, 0),
+ DECLARE_VMA_BIT(WRITE, 1),
+ DECLARE_VMA_BIT(EXEC, 2),
+ DECLARE_VMA_BIT(SHARED, 3),
+ /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
+ DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */
+ DECLARE_VMA_BIT(MAYWRITE, 5),
+ DECLARE_VMA_BIT(MAYEXEC, 6),
+ DECLARE_VMA_BIT(MAYSHARE, 7),
+ DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */
#ifdef CONFIG_MMU
-#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
-#else /* CONFIG_MMU */
-#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
-#define VM_UFFD_MISSING 0
+ DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */
+#else
+ /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
+ DECLARE_VMA_BIT(MAYOVERLAY, 9),
#endif /* CONFIG_MMU */
-#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
-#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
-
-#define VM_LOCKED 0x00002000
-#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
-
- /* Used by sys_madvise() */
-#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */
-#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */
-
-#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */
-#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */
-#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */
-#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
-#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
-#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
-#define VM_SYNC 0x00800000 /* Synchronous page faults */
-#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
-#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
-#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
-
+ /* Page-ranges managed without "struct page", just pure PFN */
+ DECLARE_VMA_BIT(PFNMAP, 10),
+ DECLARE_VMA_BIT(MAYBE_GUARD, 11),
+ DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */
+ DECLARE_VMA_BIT(LOCKED, 13),
+ DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */
+ DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */
+ DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */
+ DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */
+ DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */
+ DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */
+ DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */
+ DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */
+ DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */
+ DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */
+ DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */
+ DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */
+ DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */
+ DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */
+ DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */
+ DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */
+ DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */
+ /* These bits are reused, we define specific uses below. */
+ DECLARE_VMA_BIT(HIGH_ARCH_0, 32),
+ DECLARE_VMA_BIT(HIGH_ARCH_1, 33),
+ DECLARE_VMA_BIT(HIGH_ARCH_2, 34),
+ DECLARE_VMA_BIT(HIGH_ARCH_3, 35),
+ DECLARE_VMA_BIT(HIGH_ARCH_4, 36),
+ DECLARE_VMA_BIT(HIGH_ARCH_5, 37),
+ DECLARE_VMA_BIT(HIGH_ARCH_6, 38),
+ /*
+ * This flag is used to connect VFIO to arch specific KVM code. It
+ * indicates that the memory under this VMA is safe for use with any
+ * non-cachable memory type inside KVM. Some VFIO devices, on some
+ * platforms, are thought to be unsafe and can cause machine crashes
+ * if KVM does not lock down the memory type.
+ */
+ DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39),
+#ifdef CONFIG_PPC32
+ DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1),
+#else
+ DECLARE_VMA_BIT(DROPPABLE, 40),
+#endif
+ DECLARE_VMA_BIT(UFFD_MINOR, 41),
+ DECLARE_VMA_BIT(SEALED, 42),
+ /* Flags that reuse flags above. */
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3),
+ DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4),
+#if defined(CONFIG_X86_USER_SHADOW_STACK)
+ /*
+ * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
+ * support core mm.
+ *
+ * These VMAs will get a single end guard page. This helps userspace
+ * protect itself from attacks. A single page is enough for current
+ * shadow stack archs (x86). See the comments near alloc_shstk() in
+ * arch/x86/kernel/shstk.c for more details on the guard size.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5),
+#elif defined(CONFIG_ARM64_GCS)
+ /*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+ DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6),
+#endif
+ DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */
+ DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */
+ DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */
+ DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */
+ DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */
+ DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */
+ DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */
+#ifdef CONFIG_STACK_GROWSUP
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP),
+ DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN),
+#else
+ DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN),
+#endif
+};
+#undef DECLARE_VMA_BIT
+#undef DECLARE_VMA_BIT_ALIAS
+
+#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT)
+#define VM_READ INIT_VM_FLAG(READ)
+#define VM_WRITE INIT_VM_FLAG(WRITE)
+#define VM_EXEC INIT_VM_FLAG(EXEC)
+#define VM_SHARED INIT_VM_FLAG(SHARED)
+#define VM_MAYREAD INIT_VM_FLAG(MAYREAD)
+#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE)
+#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC)
+#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE)
+#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN)
+#ifdef CONFIG_MMU
+#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING)
+#else
+#define VM_UFFD_MISSING VM_NONE
+#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY)
+#endif
+#define VM_PFNMAP INIT_VM_FLAG(PFNMAP)
+#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD)
+#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP)
+#define VM_LOCKED INIT_VM_FLAG(LOCKED)
+#define VM_IO INIT_VM_FLAG(IO)
+#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ)
+#define VM_RAND_READ INIT_VM_FLAG(RAND_READ)
+#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY)
+#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND)
+#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT)
+#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT)
+#define VM_NORESERVE INIT_VM_FLAG(NORESERVE)
+#define VM_HUGETLB INIT_VM_FLAG(HUGETLB)
+#define VM_SYNC INIT_VM_FLAG(SYNC)
+#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1)
+#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK)
+#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP)
#ifdef CONFIG_MEM_SOFT_DIRTY
-# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
+#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY)
#else
-# define VM_SOFTDIRTY 0
+#define VM_SOFTDIRTY VM_NONE
+#endif
+#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP)
+#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE)
+#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE)
+#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE)
+#define VM_STACK INIT_VM_FLAG(STACK)
+#ifdef CONFIG_STACK_GROWS_UP
+#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY)
+#else
+#define VM_STACK_EARLY VM_NONE
#endif
-
-#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
-#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
-#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
-#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */
-
-#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
-#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
-#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
-#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
-#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
-#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
-#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
-#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
-#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
-#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
-
#ifdef CONFIG_ARCH_HAS_PKEYS
-# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
-# define VM_PKEY_BIT0 VM_HIGH_ARCH_0
-# define VM_PKEY_BIT1 VM_HIGH_ARCH_1
-# define VM_PKEY_BIT2 VM_HIGH_ARCH_2
+#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT)
+/* Despite the naming, these are FLAGS not bits. */
+#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0)
+#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1)
+#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2)
#if CONFIG_ARCH_PKEY_BITS > 3
-# define VM_PKEY_BIT3 VM_HIGH_ARCH_3
+#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3)
#else
-# define VM_PKEY_BIT3 0
-#endif
+#define VM_PKEY_BIT3 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 3 */
#if CONFIG_ARCH_PKEY_BITS > 4
-# define VM_PKEY_BIT4 VM_HIGH_ARCH_4
+#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4)
#else
-# define VM_PKEY_BIT4 0
-#endif
+#define VM_PKEY_BIT4 VM_NONE
+#endif /* CONFIG_ARCH_PKEY_BITS > 4 */
#endif /* CONFIG_ARCH_HAS_PKEYS */
-
-#ifdef CONFIG_X86_USER_SHADOW_STACK
-/*
- * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
- * support core mm.
- *
- * These VMAs will get a single end guard page. This helps userspace protect
- * itself from attacks. A single page is enough for current shadow stack archs
- * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
- * for more details on the guard size.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_5
-#endif
-
-#if defined(CONFIG_ARM64_GCS)
-/*
- * arm64's Guarded Control Stack implements similar functionality and
- * has similar constraints to shadow stacks.
- */
-# define VM_SHADOW_STACK VM_HIGH_ARCH_6
-#endif
-
-#ifndef VM_SHADOW_STACK
-# define VM_SHADOW_STACK VM_NONE
+#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS)
+#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK)
+#else
+#define VM_SHADOW_STACK VM_NONE
#endif
-
#if defined(CONFIG_PPC64)
-# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */
+#define VM_SAO INIT_VM_FLAG(SAO)
#elif defined(CONFIG_PARISC)
-# define VM_GROWSUP VM_ARCH_1
+#define VM_GROWSUP INIT_VM_FLAG(GROWSUP)
#elif defined(CONFIG_SPARC64)
-# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */
-# define VM_ARCH_CLEAR VM_SPARC_ADI
+#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif defined(CONFIG_ARM64)
-# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */
-# define VM_ARCH_CLEAR VM_ARM64_BTI
+#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI)
+#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR)
#elif !defined(CONFIG_MMU)
-# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */
+#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY)
#endif
-
-#if defined(CONFIG_ARM64_MTE)
-# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */
-# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */
-#else
-# define VM_MTE VM_NONE
-# define VM_MTE_ALLOWED VM_NONE
-#endif
-
#ifndef VM_GROWSUP
-# define VM_GROWSUP VM_NONE
+#define VM_GROWSUP VM_NONE
+#endif
+#ifdef CONFIG_ARM64_MTE
+#define VM_MTE INIT_VM_FLAG(MTE)
+#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED)
+#else
+#define VM_MTE VM_NONE
+#define VM_MTE_ALLOWED VM_NONE
#endif
-
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 41
-# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
-#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-# define VM_UFFD_MINOR VM_NONE
-#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-
-/*
- * This flag is used to connect VFIO to arch specific KVM code. It
- * indicates that the memory under this VMA is safe for use with any
- * non-cachable memory type inside KVM. Some VFIO devices, on some
- * platforms, are thought to be unsafe and can cause machine crashes
- * if KVM does not lock down the memory type.
- */
-#ifdef CONFIG_64BIT
-#define VM_ALLOW_ANY_UNCACHED_BIT 39
-#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR)
#else
-#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_UFFD_MINOR VM_NONE
#endif
-
#ifdef CONFIG_64BIT
-#define VM_DROPPABLE_BIT 40
-#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
-#elif defined(CONFIG_PPC32)
-#define VM_DROPPABLE VM_ARCH_1
+#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED)
+#define VM_SEALED INIT_VM_FLAG(SEALED)
#else
-#define VM_DROPPABLE VM_NONE
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#define VM_SEALED VM_NONE
#endif
-
-#ifdef CONFIG_64BIT
-#define VM_SEALED_BIT 42
-#define VM_SEALED BIT(VM_SEALED_BIT)
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE)
#else
-#define VM_SEALED VM_NONE
+#define VM_DROPPABLE VM_NONE
#endif
/* Bits set in the VMA until the stack is in its final location */
@@ -470,12 +529,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)
-#ifdef CONFIG_STACK_GROWSUP
-#define VM_STACK VM_GROWSUP
-#define VM_STACK_EARLY VM_GROWSDOWN
+#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
+#define VM_SEALED_SYSMAP VM_SEALED
#else
-#define VM_STACK VM_GROWSDOWN
-#define VM_STACK_EARLY 0
+#define VM_SEALED_SYSMAP VM_NONE
#endif
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
@@ -483,12 +540,26 @@ extern unsigned int kobjsize(const void *objp);
/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
-
/*
* Special vmas that are non-mergable, non-mlock()able.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * VM_IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -498,13 +569,69 @@ extern unsigned int kobjsize(const void *objp);
/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
+/* These flags can be updated atomically via VMA/mmap read lock. */
+#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD
+
/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
-# define VM_ARCH_CLEAR VM_NONE
+#define VM_ARCH_CLEAR VM_NONE
#endif
#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)
/*
+ * Flags which should be 'sticky' on merge - that is, flags which, when one VMA
+ * possesses it but the other does not, the merged VMA should nonetheless have
+ * applied to it:
+ *
+ * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its
+ * references cleared via /proc/$pid/clear_refs, any merged VMA
+ * should be considered soft-dirty also as it operates at a VMA
+ * granularity.
+ *
+ * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that
+ * mapped page tables may contain metadata not described by the
+ * VMA and thus any merged VMA may also contain this metadata,
+ * and thus we must make this flag sticky.
+ */
+#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD)
+
+/*
+ * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one
+ * of these flags and the other not does not preclude a merge.
+ *
+ * VM_STICKY - When merging VMAs, VMA flags must match, unless they are
+ * 'sticky'. If any sticky flags exist in either VMA, we simply
+ * set all of them on the merged VMA.
+ */
+#define VM_IGNORE_MERGE VM_STICKY
+
+/*
+ * Flags which should result in page tables being copied on fork. These are
+ * flags which indicate that the VMA maps page tables which cannot be
+ * reconsistuted upon page fault, so necessitate page table copying upon
+ *
+ * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be
+ * reasonably reconstructed on page fault.
+ *
+ * VM_UFFD_WP - Encodes metadata about an installed uffd
+ * write protect handler, which cannot be
+ * reconstructed on page fault.
+ *
+ * We always copy pgtables when dst_vma has uffd-wp
+ * enabled even if it's file-backed
+ * (e.g. shmem). Because when uffd-wp is enabled,
+ * pgtable contains uffd-wp protection information,
+ * that's something we can't retrieve from page cache,
+ * and skip copying will lose those info.
+ *
+ * VM_MAYBE_GUARD - Could contain page guard region markers which
+ * by design are a property of the page tables
+ * only and thus cannot be reconstructed on page
+ * fault.
+ */
+#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)
+
+/*
* mapping from the currently active vm_flags protection bits (the
* low four bits) to a page protection mask..
*/
@@ -783,7 +910,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
static inline void vm_flags_init(struct vm_area_struct *vma,
vm_flags_t flags)
{
- ACCESS_PRIVATE(vma, __vm_flags) = flags;
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
+ vma_flags_clear_all(&vma->flags);
+ vma_flags_overwrite_word(&vma->flags, flags);
}
/*
@@ -794,6 +923,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma,
static inline void vm_flags_reset(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_assert_write_locked(vma);
vm_flags_init(vma, flags);
}
@@ -802,21 +932,33 @@ static inline void vm_flags_reset_once(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_assert_write_locked(vma);
- WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
+ /*
+ * If VMA flags exist beyond the first system word, also clear these. It
+ * is assumed the write once behaviour is required only for the first
+ * system word.
+ */
+ if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) {
+ unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
+ bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG);
+ }
+
+ vma_flags_overwrite_word_once(&vma->flags, flags);
}
static inline void vm_flags_set(struct vm_area_struct *vma,
vm_flags_t flags)
{
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) |= flags;
+ vma_flags_set_word(&vma->flags, flags);
}
static inline void vm_flags_clear(struct vm_area_struct *vma,
vm_flags_t flags)
{
+ VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY));
vma_start_write(vma);
- ACCESS_PRIVATE(vma, __vm_flags) &= ~flags;
+ vma_flags_clear_word(&vma->flags, flags);
}
/*
@@ -840,6 +982,51 @@ static inline void vm_flags_mod(struct vm_area_struct *vma,
__vm_flags_mod(vma, set, clear);
}
+static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ const vm_flags_t mask = BIT((__force int)bit);
+
+ /* Only specific flags are permitted */
+ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED)))
+ return false;
+
+ return true;
+}
+
+/*
+ * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific
+ * valid flags are allowed to do this.
+ */
+static inline void vma_flag_set_atomic(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags);
+
+ /* mmap read lock/VMA read lock must be held. */
+ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+ vma_assert_locked(vma);
+
+ if (__vma_flag_atomic_valid(vma, bit))
+ set_bit((__force int)bit, bitmap);
+}
+
+/*
+ * Test for VMA flag atomically. Requires no locks. Only specific valid flags
+ * are allowed to do this.
+ *
+ * This is necessarily racey, so callers must ensure that serialisation is
+ * achieved through some other means, or that races are permissible.
+ */
+static inline bool vma_flag_test_atomic(struct vm_area_struct *vma,
+ vma_flag_t bit)
+{
+ if (__vma_flag_atomic_valid(vma, bit))
+ return test_bit((__force int)bit, &vma->vm_flags);
+
+ return false;
+}
+
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
vma->vm_ops = NULL;
@@ -2074,7 +2261,7 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
return folio_large_nr_pages(folio);
}
-#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE)
+#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS)
/*
* We don't expect any folios that exceed buddy sizes (and consequently
* memory sections).
@@ -2087,10 +2274,17 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
* pages are guaranteed to be contiguous.
*/
#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT
-#else
+#elif defined(CONFIG_HUGETLB_PAGE)
/*
* There is no real limit on the folio size. We limit them to the maximum we
- * currently expect (e.g., hugetlb, dax).
+ * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
+ * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
+ */
+#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#else
+/*
+ * Without hugetlb, gigantic folios that are bigger than a single PUD are
+ * currently impossible.
*/
#define MAX_FOLIO_ORDER PUD_ORDER
#endif
@@ -2401,31 +2595,6 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
- return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
- /*
- * Use the processor id as a fall-back when the mm cid feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
-}
-#endif
-
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
@@ -2456,7 +2625,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long start,
- unsigned long end, unsigned long tree_end, bool mm_wr_locked);
+ unsigned long end, unsigned long tree_end);
struct mmu_notifier_range;
@@ -2940,6 +3109,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
#endif /* CONFIG_MMU */
enum pt_flags {
+ PT_kernel = PG_referenced,
PT_reserved = PG_reserved,
/* High bits are used for zone/node/section */
};
@@ -2966,6 +3136,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
}
/**
+ * ptdesc_set_kernel - Mark a ptdesc used to map the kernel
+ * @ptdesc: The ptdesc to be marked
+ *
+ * Kernel page tables often need special handling. Set a flag so that
+ * the handling code knows this ptdesc will not be used for userspace.
+ */
+static inline void ptdesc_set_kernel(struct ptdesc *ptdesc)
+{
+ set_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel
+ * @ptdesc: The ptdesc to be unmarked
+ *
+ * Use when the ptdesc is no longer used to map the kernel and no longer
+ * needs special handling.
+ */
+static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc)
+{
+ /*
+ * Note: the 'PG_referenced' bit does not strictly need to be
+ * cleared before freeing the page. But this is nice for
+ * symmetry.
+ */
+ clear_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
+ * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel
+ * @ptdesc: The ptdesc being tested
+ *
+ * Call to tell if the ptdesc used to map the kernel.
+ */
+static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc)
+{
+ return test_bit(PT_kernel, &ptdesc->pt_flags.f);
+}
+
+/**
* pagetable_alloc - Allocate pagetables
* @gfp: GFP flags
* @order: desired pagetable order
@@ -2983,6 +3193,21 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
}
#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
+static inline void __pagetable_free(struct ptdesc *pt)
+{
+ struct page *page = ptdesc_page(pt);
+
+ __free_pages(page, compound_order(page));
+}
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+void pagetable_free_kernel(struct ptdesc *pt);
+#else
+static inline void pagetable_free_kernel(struct ptdesc *pt)
+{
+ __pagetable_free(pt);
+}
+#endif
/**
* pagetable_free - Free pagetables
* @pt: The page table descriptor
@@ -2992,9 +3217,12 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde
*/
static inline void pagetable_free(struct ptdesc *pt)
{
- struct page *page = ptdesc_page(pt);
-
- __free_pages(page, compound_order(page));
+ if (ptdesc_test_kernel(pt)) {
+ ptdesc_clear_kernel(pt);
+ pagetable_free_kernel(pt);
+ } else {
+ __pagetable_free(pt);
+ }
}
#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
@@ -3369,6 +3597,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
struct rb_root_cached *root);
+struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node,
+ unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
@@ -3495,10 +3725,10 @@ struct vm_unmapped_area_info {
extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
-extern void truncate_inode_pages(struct address_space *, loff_t);
-extern void truncate_inode_pages_range(struct address_space *,
- loff_t lstart, loff_t lend);
-extern void truncate_inode_pages_final(struct address_space *);
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart);
+void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart,
+ uoff_t lend);
+void truncate_inode_pages_final(struct address_space *mapping);
/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
@@ -3576,6 +3806,90 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma)
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
+static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
+{
+ return desc->end - desc->start;
+}
+
+static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
+{
+ return vma_desc_size(desc) >> PAGE_SHIFT;
+}
+
+/**
+ * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN
+ * remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_remap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ struct mmap_action *action = &desc->action;
+
+ /* [start, start + size) must be within the VMA. */
+ WARN_ON_ONCE(start < desc->start || start >= desc->end);
+ WARN_ON_ONCE(start + size > desc->end);
+
+ action->type = MMAP_REMAP_PFN;
+ action->remap.start = start;
+ action->remap.start_pfn = start_pfn;
+ action->remap.size = size;
+ action->remap.pgprot = desc->page_prot;
+}
+
+/**
+ * mmap_action_remap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_remap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+/**
+ * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN
+ * I/O remap is required.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start: The virtual address to start the remap from, must be within the VMA.
+ * @start_pfn: The first PFN in the range to remap.
+ * @size: The size of the range to remap, in bytes, at most spanning to the end
+ * of the VMA.
+ */
+static inline void mmap_action_ioremap(struct vm_area_desc *desc,
+ unsigned long start,
+ unsigned long start_pfn,
+ unsigned long size)
+{
+ mmap_action_remap(desc, start, start_pfn, size);
+ desc->action.type = MMAP_IO_REMAP_PFN;
+}
+
+/**
+ * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the
+ * entirety of a VMA should be PFN I/O remapped.
+ * @desc: The VMA descriptor for the VMA requiring remap.
+ * @start_pfn: The first PFN in the range to remap.
+ */
+static inline void mmap_action_ioremap_full(struct vm_area_desc *desc,
+ unsigned long start_pfn)
+{
+ mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc));
+}
+
+void mmap_action_prepare(struct mmap_action *action,
+ struct vm_area_desc *desc);
+int mmap_action_complete(struct mmap_action *action,
+ struct vm_area_struct *vma);
+
/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
unsigned long vm_start, unsigned long vm_end)
@@ -3617,10 +3931,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
unsigned long addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t);
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot);
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
@@ -3653,15 +3966,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
-#ifndef io_remap_pfn_range
-static inline int io_remap_pfn_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long pfn,
- unsigned long size, pgprot_t prot)
+#ifndef io_remap_pfn_range_pfn
+static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
+ unsigned long size)
{
- return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot));
+ return pfn;
}
#endif
+static inline int io_remap_pfn_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long orig_pfn,
+ unsigned long size, pgprot_t orig_prot)
+{
+ const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size);
+ const pgprot_t prot = pgprot_decrypted(orig_prot);
+
+ return remap_pfn_range(vma, addr, pfn, size, prot);
+}
+
static inline vm_fault_t vmf_error(int err)
{
if (err == -ENOMEM)
@@ -4110,6 +4432,7 @@ enum mf_action_page_type {
MF_MSG_DAX,
MF_MSG_UNSPLIT_THP,
MF_MSG_ALREADY_POISONED,
+ MF_MSG_PFN_MAP,
MF_MSG_UNKNOWN,
};
@@ -4238,16 +4561,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
-
-/*
- * mseal of userspace process's system mappings.
- */
-#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
-#define VM_SEALED_SYSMAP VM_SEALED
-#else
-#define VM_SEALED_SYSMAP VM_NONE
-#endif
-
/*
* DMA mapping IDs for page_pool
*