diff options
Diffstat (limited to 'include/linux/mm.h')
| -rw-r--r-- | include/linux/mm.h | 4725 |
1 files changed, 3762 insertions, 963 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index f0224608d15e..7a1819c20643 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1,44 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> - -#ifdef __KERNEL__ - +#include <linux/mmdebug.h> #include <linux/gfp.h> +#include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> +#include <linux/compiler.h> #include <linux/mm_types.h> +#include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> +#include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> +#include <linux/resource.h> +#include <linux/page_ext.h> +#include <linux/err.h> +#include <linux/page-flags.h> +#include <linux/page_ref.h> +#include <linux/overflow.h> +#include <linux/sizes.h> +#include <linux/sched.h> +#include <linux/pgtable.h> +#include <linux/kasan.h> +#include <linux/memremap.h> +#include <linux/slab.h> +#include <linux/cacheinfo.h> +#include <linux/rcuwait.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; -struct file_ra_state; struct user_struct; -struct writeback_control; +struct pt_regs; +struct folio_batch; -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; +void arch_mm_preinit(void); +void mm_core_init(void); +void init_mm_internals(void); -static inline void set_max_mapnr(unsigned long limit) +extern atomic_long_t _totalram_pages; +static inline unsigned long totalram_pages(void) { - max_mapnr = limit; + return (unsigned long)atomic_long_read(&_totalram_pages); +} + +static inline void totalram_pages_inc(void) +{ + atomic_long_inc(&_totalram_pages); +} + +static inline void totalram_pages_dec(void) +{ + atomic_long_dec(&_totalram_pages); +} + +static inline void totalram_pages_add(long count) +{ + atomic_long_add(count, &_totalram_pages); } -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif -extern unsigned long totalram_pages; extern void * high_memory; -extern int page_cluster; + +/* + * Convert between pages and MB + * 20 is the shift for 1MB (2^20 = 1MB) + * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) + * So (20 - PAGE_SHIFT) converts between pages and MB + */ +#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; @@ -46,20 +86,170 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +extern const int mmap_rnd_bits_min; +extern int mmap_rnd_bits_max __ro_after_init; +extern int mmap_rnd_bits __read_mostly; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +extern const int mmap_rnd_compat_bits_min; +extern const int mmap_rnd_compat_bits_max; +extern int mmap_rnd_compat_bits __read_mostly; +#endif + +#ifndef DIRECT_MAP_PHYSMEM_END +# ifdef MAX_PHYSMEM_BITS +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# else +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# endif +#endif + +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/processor.h> +#ifndef __pa_symbol +#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) +#endif + +#ifndef page_to_virt +#define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) +#endif + +#ifndef lm_alias +#define lm_alias(x) __va(__pa_symbol(x)) +#endif + +/* + * To prevent common memory management code establishing + * a zero page mapping on a read fault. + * This macro should be defined within <asm/pgtable.h>. + * s390 does this to prevent multiplexing of hardware bits + * related to the physical page in case of virtualization. + */ +#ifndef mm_forbids_zeropage +#define mm_forbids_zeropage(X) (0) +#endif + +/* + * On some architectures it is expensive to call memset() for small sizes. + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in <asm/pgtable.h> + */ +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 96 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statements if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. + */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 96); + + switch (sizeof(struct page)) { + case 96: + _pp[11] = 0; + fallthrough; + case 88: + _pp[10] = 0; + fallthrough; + case 80: + _pp[9] = 0; + fallthrough; + case 72: + _pp[8] = 0; + fallthrough; + case 64: + _pp[7] = 0; + fallthrough; + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else +#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) +#endif + +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + * + * ELF extended numbering allows more than 65535 sections, so 16-bit bound is + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); +#else +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} +#endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +/* to align the pointer to the (prev) page boundary */ +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) + /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ -#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)addr, PAGE_SIZE) +#define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) + +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. + */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + +static inline struct folio *lru_to_folio(struct list_head *head) +{ + return list_entry((head)->prev, struct folio, lru); +} + +void setup_initial_init_mm(void *start_code, void *end_code, + void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. @@ -70,7 +260,9 @@ extern unsigned long sysctl_admin_reserve_kbytes; * mmap() functions). */ -extern struct kmem_cache *vm_area_cachep; +struct vm_area_struct *vm_area_alloc(struct mm_struct *); +struct vm_area_struct *vm_area_dup(struct vm_area_struct *); +void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -81,135 +273,512 @@ extern unsigned int kobjsize(const void *objp); /* * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 - -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 - -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ - -#if defined(CONFIG_X86) -# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; + +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. */ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 -#elif defined(CONFIG_METAG) -# define VM_GROWSUP VM_ARCH_1 -#elif defined(CONFIG_IA64) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) +#else +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#define VM_SEALED_SYSMAP VM_NONE #endif +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + /* * Special vmas that are non-mergable, non-mlock()able. - * Note: mm/huge_memory.c VM_NO_THP depends on this definition. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP) +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + +/* This mask prevents VMA from being scanned with khugepaged */ +#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) + +/* This mask defines which mm->def_flags a process can inherit its parent */ +#define VM_INIT_DEF_MASK VM_NOHUGEPAGE + +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) + +/* These flags can be updated atomically via VMA/mmap read lock. */ +#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD + +/* Arch-specific flags to clear when updating VM flags on protection change */ +#ifndef VM_ARCH_CLEAR +#define VM_ARCH_CLEAR VM_NONE +#endif +#define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) + +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE VM_STICKY + +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -extern pgprot_t protection_map[16]; -#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ +/* + * The default fault flags that should be used by most of the + * arch-specific page fault handlers. + */ +#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ + FAULT_FLAG_KILLABLE | \ + FAULT_FLAG_INTERRUPTIBLE) + +/** + * fault_flag_allow_retry_first - check ALLOW_RETRY the first time + * @flags: Fault flags. + * + * This is mostly used for places where we want to try to avoid taking + * the mmap_lock for too long a time when waiting for another condition + * to change, in which case we can try to be polite to release the + * mmap_lock in the first round to avoid potential starvation of other + * processes that would also want the mmap_lock. + * + * Return: true if the page fault allows retry and this is the first + * attempt of the fault handling; false otherwise. + */ +static inline bool fault_flag_allow_retry_first(enum fault_flag flags) +{ + return (flags & FAULT_FLAG_ALLOW_RETRY) && + (!(flags & FAULT_FLAG_TRIED)); +} + +#define FAULT_FLAG_TRACE \ + { FAULT_FLAG_WRITE, "WRITE" }, \ + { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ + { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ + { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ + { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ + { FAULT_FLAG_TRIED, "TRIED" }, \ + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ + { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ + { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* - * vm_fault is filled by the the pagefault handler and passed to the vma's + * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * MM layer fills up gfp_mask for page allocations but fault handler might + * alter it if its implementation requires a different allocation context. + * + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { - unsigned int flags; /* FAULT_FLAG_xxx flags */ - pgoff_t pgoff; /* Logical page offset based on vma */ - void __user *virtual_address; /* Faulting virtual address */ + const struct { + struct vm_area_struct *vma; /* Target VMA */ + gfp_t gfp_mask; /* gfp mask to be used for allocations */ + pgoff_t pgoff; /* Logical page offset based on vma */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmasked */ + }; + enum fault_flag flags; /* FAULT_FLAG_xxx flags + * XXX: should really be 'const' */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' */ + pud_t *pud; /* Pointer to pud entry matching + * the 'address' + */ + union { + pte_t orig_pte; /* Value of PTE at the time of fault */ + pmd_t orig_pmd; /* Value of PMD at the time of fault, + * used by PMD fault only. + */ + }; + struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ + /* These three entries are valid only while holding ptl lock */ + pte_t *pte; /* Pointer to pte entry matching + * the 'address'. NULL if the page + * table hasn't been allocated. + */ + spinlock_t *ptl; /* Page table lock. + * Protects pte page table if 'pte' + * is not NULL, otherwise pmd. + */ + pgtable_t prealloc_pte; /* Pre-allocated pte page table. + * vm_ops->map_pages() sets up a page + * table from atomic context. + * do_fault_around() pre-allocates + * page table to avoid allocation from + * atomic context. + */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer - * to the functions called when a no-page or a wp-page exception occurs. + * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); - int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); + int (*mremap)(struct vm_area_struct *area); + /* + * Called by mprotect() to make driver-specific permission + * checks before mprotect() is finalised. The VMA must not + * be modified. Returns 0 if mprotect() can proceed. + */ + int (*mprotect)(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags); + vm_fault_t (*fault)(struct vm_fault *vmf); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); + unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); + + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically - * for use by special VMAs that can switch between memory and hardware + * for use by special VMAs. See also generic_access_phys() for a generic + * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy @@ -225,287 +794,685 @@ struct vm_operations_struct { * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not - * marked as MPOL_SHARED. vma policies are protected by the mmap_sem. + * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); - int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags); + unsigned long addr, pgoff_t *ilx); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); +#ifdef CONFIG_FIND_NORMAL_PAGE + /* + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. + */ + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; -struct mmu_gather; -struct inode; +#ifdef CONFIG_NUMA_BALANCING +static inline void vma_numab_state_init(struct vm_area_struct *vma) +{ + vma->numab_state = NULL; +} +static inline void vma_numab_state_free(struct vm_area_struct *vma) +{ + kfree(vma->numab_state); +} +#else +static inline void vma_numab_state_init(struct vm_area_struct *vma) {} +static inline void vma_numab_state_free(struct vm_area_struct *vma) {} +#endif /* CONFIG_NUMA_BALANCING */ -#define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) +/* + * These must be here rather than mmap_lock.h as dependent on vm_fault type, + * declared in this header. + */ +#ifdef CONFIG_PER_VMA_LOCK +static inline void release_fault_lock(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_end_read(vmf->vma); + else + mmap_read_unlock(vmf->vma->vm_mm); +} -/* It's valid only if the page is free path or free_list */ -static inline void set_freepage_migratetype(struct page *page, int migratetype) +static inline void assert_fault_locked(const struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_assert_locked(vmf->vma); + else + mmap_assert_locked(vmf->vma->vm_mm); +} +#else +static inline void release_fault_lock(struct vm_fault *vmf) { - page->index = migratetype; + mmap_read_unlock(vmf->vma->vm_mm); } -/* It's valid only if the page is free path or free_list */ -static inline int get_freepage_migratetype(struct page *page) +static inline void assert_fault_locked(const struct vm_fault *vmf) { - return page->index; + mmap_assert_locked(vmf->vma->vm_mm); +} +#endif /* CONFIG_PER_VMA_LOCK */ + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); +} + +extern const struct vm_operations_struct vma_dummy_vm_ops; + +static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) +{ + memset(vma, 0, sizeof(*vma)); + vma->vm_mm = mm; + vma->vm_ops = &vma_dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); + vma_lock_init(vma, false); +} + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* - * FIXME: take this include out, include page-flags.h in - * files which need it (119 of them) + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. */ -#include <linux/page-flags.h> -#include <linux/huge_mm.h> +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} /* - * Methods to modify the page usage count. - * - * What counts for a page usage: - * - cache mapping (page->mapping) - * - private data (page->private) - * - page mapped in a task's page tables, each mapping - * is counted separately - * - * Also, many kernel routines increase the page count before a critical - * routine so they can be sure the page doesn't go away from under them. + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} /* - * Drop a ref, return true if the refcount fell to zero (the page has no users) + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. */ -static inline int put_page_testzero(struct page *page) +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) { - VM_BUG_ON(atomic_read(&page->_count) == 0); - return atomic_dec_and_test(&page->_count); + vma_start_write(vma); + __vm_flags_mod(vma, set, clear); +} + +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + vma_flag_t bit) +{ + const vm_flags_t mask = BIT((__force int)bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; } /* - * Try to grab a ref unless the page has a refcount of zero, return false if - * that is the case. + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. */ -static inline int get_page_unless_zero(struct page *page) +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { - return atomic_inc_not_zero(&page->_count); -} + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); -extern int page_is_ram(unsigned long pfn); + /* mmap read lock/VMA read lock must be held. */ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); -/* Support for virtually mapped pages */ -struct page *vmalloc_to_page(const void *addr); -unsigned long vmalloc_to_pfn(const void *addr); + if (__vma_flag_atomic_valid(vma, bit)) + set_bit((__force int)bit, bitmap); +} /* - * Determine if an address is within the vmalloc range + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. * - * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there - * is no special casing required. + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. */ -static inline int is_vmalloc_addr(const void *x) +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { -#ifdef CONFIG_MMU - unsigned long addr = (unsigned long)x; + if (__vma_flag_atomic_valid(vma, bit)) + return test_bit((__force int)bit, &vma->vm_flags); - return addr >= VMALLOC_START && addr < VMALLOC_END; -#else - return 0; -#endif + return false; } -#ifdef CONFIG_MMU -extern int is_vmalloc_or_module_addr(const void *x); -#else -static inline int is_vmalloc_or_module_addr(const void *x) + +static inline void vma_set_anonymous(struct vm_area_struct *vma) { - return 0; + vma->vm_ops = NULL; } -#endif -static inline void compound_lock(struct page *page) +static inline bool vma_is_anonymous(struct vm_area_struct *vma) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(PageSlab(page)); - bit_spin_lock(PG_compound_lock, &page->flags); -#endif + return !vma->vm_ops; } -static inline void compound_unlock(struct page *page) +/* + * Indicate if the VMA is a heap for the given task; for + * /proc/PID/maps that is the heap of the main task. + */ +static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(PageSlab(page)); - bit_spin_unlock(PG_compound_lock, &page->flags); -#endif + return vma->vm_start < vma->vm_mm->brk && + vma->vm_end > vma->vm_mm->start_brk; } -static inline unsigned long compound_lock_irqsave(struct page *page) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { - unsigned long uninitialized_var(flags); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - local_irq_save(flags); - compound_lock(page); -#endif - return flags; + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } -static inline void compound_unlock_irqrestore(struct page *page, - unsigned long flags) +static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - compound_unlock(page); - local_irq_restore(flags); -#endif + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); + + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; } -static inline struct page *compound_head(struct page *page) +static inline bool vma_is_foreign(const struct vm_area_struct *vma) { - if (unlikely(PageTail(page))) - return page->first_page; - return page; + if (!current->mm) + return true; + + if (current->mm != vma->vm_mm) + return true; + + return false; } -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ -static inline void page_mapcount_reset(struct page *page) +static inline bool vma_is_accessible(const struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_ACCESS_FLAGS; +} + +static inline bool is_shared_maywrite(vm_flags_t vm_flags) { - atomic_set(&(page)->_mapcount, -1); + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); } -static inline int page_mapcount(struct page *page) +static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { - return atomic_read(&(page)->_mapcount) + 1; + return is_shared_maywrite(vma->vm_flags); } -static inline int page_count(struct page *page) +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return atomic_read(&compound_head(page)->_count); + return mas_find(&vmi->mas, max - 1); } -static inline void get_huge_page_tail(struct page *page) +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * __split_huge_page_refcount() cannot run - * from under us. + * Uses mas_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); + return mas_find(&vmi->mas, ULONG_MAX); +} + +static inline +struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) +{ + return mas_next_range(&vmi->mas, ULONG_MAX); } -extern bool __get_page_tail(struct page *page); -static inline void get_page(struct page *page) +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { - if (unlikely(PageTail(page))) - if (likely(__get_page_tail(page))) - return; - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_inc(&page->_count); + return mas_prev(&vmi->mas, 0); } -static inline struct page *virt_to_head_page(const void *x) +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) { - struct page *page = virt_to_page(x); - return compound_head(page); + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); } +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + vma_mark_attached(vma); + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) + +#ifdef CONFIG_SHMEM /* - * Setup the page count before being freed into the page allocator for - * the first time (boot or memory hotplug) + * The vma_is_shmem is not inline because it is used only by slow + * paths in userfault. */ -static inline void init_page_count(struct page *page) +bool vma_is_shmem(const struct vm_area_struct *vma); +bool vma_is_anon_shmem(const struct vm_area_struct *vma); +#else +static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } +#endif + +int vma_is_stack_for_current(const struct vm_area_struct *vma); + +/* flush_tlb_range() takes a vma, not a mm, and can care about flags */ +#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } + +struct mmu_gather; +struct inode; + +extern void prep_compound_page(struct page *page, unsigned int order); + +static inline unsigned int folio_large_order(const struct folio *folio) { - atomic_set(&page->_count, 1); + return folio->_flags_1 & 0xff; } +#ifdef NR_PAGES_IN_LARGE_FOLIO +static inline unsigned long folio_large_nr_pages(const struct folio *folio) +{ + return folio->_nr_pages; +} +#else +static inline unsigned long folio_large_nr_pages(const struct folio *folio) +{ + return 1L << folio_large_order(folio); +} +#endif + +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before the order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ +static inline unsigned int compound_order(const struct page *page) +{ + const struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags.f)) + return 0; + return folio_large_order(folio); +} + +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(const struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio_large_order(folio); +} + +/** + * folio_reset_order - Reset the folio order and derived _nr_pages + * @folio: The folio. + * + * Reset the order and derived _nr_pages to 0. Must only be used in the + * process of splitting large folios. + */ +static inline void folio_reset_order(struct folio *folio) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + folio->_flags_1 &= ~0xffUL; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif +} + +#include <linux/huge_mm.h> + /* - * PageBuddy() indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - private data (page->private) + * - page mapped in a task's page tables, each mapping + * is counted separately * - * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to - * -2 so that an underflow of the page_mapcount() won't be mistaken - * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very - * efficiently by most CPU architectures. + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -static inline int PageBuddy(struct page *page) +/* + * Drop a ref, return true if the refcount fell to zero (the page has no users) + */ +static inline int put_page_testzero(struct page *page) { - return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + return page_ref_dec_and_test(page); } -static inline void __SetPageBuddy(struct page *page) +static inline int folio_put_testzero(struct folio *folio) { - VM_BUG_ON(atomic_read(&page->_mapcount) != -1); - atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); + return put_page_testzero(&folio->page); } -static inline void __ClearPageBuddy(struct page *page) +/* + * Try to grab a ref unless the page has a refcount of zero, return false if + * that is the case. + * This can be called when MMU is off so it must not access + * any of the virtual mappings. + */ +static inline bool get_page_unless_zero(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); - atomic_set(&page->_mapcount, -1); + return page_ref_add_unless(page, 1, 0); } -void put_page(struct page *page); -void put_pages_list(struct list_head *pages); +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} -void split_page(struct page *page, unsigned int order); -int split_free_page(struct page *page); +extern int page_is_ram(unsigned long pfn); + +enum { + REGION_INTERSECTS, + REGION_DISJOINT, + REGION_MIXED, +}; + +int region_intersects(resource_size_t offset, size_t size, unsigned long flags, + unsigned long desc); + +/* Support for virtually mapped pages */ +struct page *vmalloc_to_page(const void *addr); +unsigned long vmalloc_to_pfn(const void *addr); /* - * Compound pages have a destructor function. Provide a - * prototype for that function and accessor functions. - * These are _only_ valid on the head of a PG_compound page. + * Determine if an address is within the vmalloc range + * + * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there + * is no special casing required. */ -typedef void compound_page_dtor(struct page *); +#ifdef CONFIG_MMU +extern bool is_vmalloc_addr(const void *x); +extern int is_vmalloc_or_module_addr(const void *x); +#else +static inline bool is_vmalloc_addr(const void *x) +{ + return false; +} +static inline int is_vmalloc_or_module_addr(const void *x) +{ + return 0; +} +#endif -static inline void set_compound_page_dtor(struct page *page, - compound_page_dtor *dtor) +/* + * How many times the entire folio is mapped as a single unit (eg by a + * PMD or PUD entry). This is probably not what you want, except for + * debugging purposes or implementation of other core folio_*() primitives. + */ +static inline int folio_entire_mapcount(const struct folio *folio) { - page[1].lru.next = (void *)dtor; + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio_large_order(folio) == 1)) + return 0; + return atomic_read(&folio->_entire_mapcount) + 1; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +static inline int folio_large_mapcount(const struct folio *folio) { - return (compound_page_dtor *)page[1].lru.next; + VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(&folio->_large_mapcount) + 1; } -static inline int compound_order(struct page *page) +/** + * folio_mapcount() - Number of mappings of this folio. + * @folio: The folio. + * + * The folio mapcount corresponds to the number of present user page table + * entries that reference any part of a folio. Each such present user page + * table entry must be paired with exactly on folio reference. + * + * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts + * exactly once. + * + * For hugetlb folios, each abstracted "hugetlb" user page table entry that + * references the entire folio counts exactly once, even when such special + * page table entries are comprised of multiple ordinary page table entries. + * + * Will report 0 for pages which cannot be mapped into userspace, such as + * slab, page tables and similar. + * + * Return: The number of times this folio is mapped. + */ +static inline int folio_mapcount(const struct folio *folio) { - if (!PageHead(page)) - return 0; - return (unsigned long)page[1].lru.prev; + int mapcount; + + if (likely(!folio_test_large(folio))) { + mapcount = atomic_read(&folio->_mapcount) + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + return mapcount; + } + return folio_large_mapcount(folio); } -static inline int compound_trans_order(struct page *page) +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. + */ +static inline bool folio_mapped(const struct folio *folio) { - int order; - unsigned long flags; + return folio_mapcount(folio) >= 1; +} - if (!PageHead(page)) - return 0; +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(const struct page *page) +{ + return folio_mapped(page_folio(page)); +} + +static inline struct page *virt_to_head_page(const void *x) +{ + struct page *page = virt_to_page(x); + + return compound_head(page); +} + +static inline struct folio *virt_to_folio(const void *x) +{ + struct page *page = virt_to_page(x); + + return page_folio(page); +} + +void __folio_put(struct folio *folio); + +void split_page(struct page *page, unsigned int order); +void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); - flags = compound_lock_irqsave(page); - order = compound_order(page); - compound_unlock_irqrestore(page, flags); - return order; +unsigned long nr_free_buffer_pages(void); + +/* Returns the number of bytes in this potentially compound page. */ +static inline unsigned long page_size(const struct page *page) +{ + return PAGE_SIZE << compound_order(page); +} + +/* Returns the number of bits needed for the number of bytes in a page */ +static inline unsigned int page_shift(struct page *page) +{ + return PAGE_SHIFT + compound_order(page); } -static inline void set_compound_order(struct page *page, unsigned long order) +/** + * thp_order - Order of a transparent huge page. + * @page: Head page of a transparent huge page. + */ +static inline unsigned int thp_order(struct page *page) { - page[1].lru.prev = (void *)order; + VM_BUG_ON_PGFLAGS(PageTail(page), page); + return compound_order(page); +} + +/** + * thp_size - Size of a transparent huge page. + * @page: Head page of a transparent huge page. + * + * Return: Number of bytes in this page. + */ +static inline unsigned long thp_size(struct page *page) +{ + return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU @@ -518,9 +1485,15 @@ static inline void set_compound_order(struct page *page, unsigned long order) static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); + pte = pte_mkwrite(pte, vma); return pte; } + +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page); +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr); + +vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* @@ -557,9 +1530,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * - * A page may belong to an inode's memory mapping. In this case, page->mapping - * is the pointer to the inode, and page->index is the file offset of the page, - * in units of PAGE_CACHE_SIZE. + * A folio may belong to an inode's memory mapping. In this case, + * folio->mapping points to the inode, and folio->index is the file + * offset of the folio, in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that @@ -570,7 +1543,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is - * rooted at mapping->page_tree, and indexed by offset. + * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * @@ -583,165 +1556,686 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. +/* 127: arbitrary random number, small enough to assemble well */ +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) folio_ref_count(folio) + 127u <= 127u) + +/** + * folio_get - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * folio_try_get() may be the right interface for you to use. */ +static inline void folio_get(struct folio *folio) +{ + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + folio_ref_inc(folio); +} -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +static inline void get_page(struct page *page) +{ + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + if (WARN_ON_ONCE(folio_test_large_kmalloc(folio))) + return; + folio_get(folio); +} + +static inline __must_check bool try_get_page(struct page *page) +{ + page = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(page) <= 0)) + return false; + page_ref_inc(page); + return true; +} + +/** + * folio_put - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put(struct folio *folio) +{ + if (folio_put_testzero(folio)) + __folio_put(folio); +} + +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) +{ + if (folio_ref_sub_and_test(folio, refs)) + __folio_put(folio); +} + +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. + * union release_pages_arg - an array of pages or folios + * + * release_pages() releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \ - NODES_PGOFF : ZONES_PGOFF) -#endif +void release_pages(release_pages_arg, int nr); -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folios_put(struct folio_batch *folios) +{ + folios_put_refs(folios, NULL); +} -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#endif +static inline void put_page(struct page *page) +{ + struct folio *folio = page_folio(page); + + if (folio_test_slab(folio) || folio_test_large_kmalloc(folio)) + return; -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) + folio_put(folio); +} -static inline enum zone_type page_zonenum(const struct page *page) +/* + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload + * the page's refcount so that two separate items are tracked: the original page + * reference count, and also a new count of how many pin_user_pages() calls were + * made against the page. ("gup-pinned" is another term for the latter). + * + * With this scheme, pin_user_pages() becomes special: such pages are marked as + * distinct from normal pages. As such, the unpin_user_page() call (and its + * variants) must be used in order to release gup-pinned pages. + * + * Choice of value: + * + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference + * counts with respect to pin_user_pages() and unpin_user_page() becomes + * simpler, due to the fact that adding an even power of two to the page + * refcount has the effect of using only the upper N bits, for the code that + * counts up using the bias value. This means that the lower bits are left for + * the exclusive use of the original code that increments and decrements by one + * (or at least, by much smaller values than the bias value). + * + * Of course, once the lower bits overflow into the upper bits (and this is + * OK, because subtraction recovers the original values), then visual inspection + * no longer suffices to directly view the separate counts. However, for normal + * applications that don't have huge page reference counts, this won't be an + * issue. + * + * Locking: the lockless algorithm described in folio_try_get_rcu() + * provides safe operation for get_user_pages(), folio_mkclean() and + * other calls that race to set up page table entries. + */ +#define GUP_PIN_COUNTING_BIAS (1U << 10) + +void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty); +void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); + +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) { - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + /* + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. + */ + return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } +#endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* - * The identification function is only used by the buddy allocator for - * determining if two pages could be buddies. We are not really - * identifying a zone since we could be using a the section number - * id if we have not node id available in page flags. - * We guarantee only that it will return the same value for two - * combinable pages in a zone. + * The identification function is mainly used by the buddy allocator for + * determining if two pages could be buddies. We are not really identifying + * the zone since we could be using the section number id if we do not have + * node id available in page flags. + * We only guarantee that it will return the same value for two combinable + * pages in a zone. */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; + return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline int zone_to_nid(struct zone *zone) -{ -#ifdef CONFIG_NUMA - return zone->node; +#ifdef NODE_NOT_IN_PAGE_FLAGS +int memdesc_nid(memdesc_flags_t mdf); #else - return 0; -#endif +static inline int memdesc_nid(memdesc_flags_t mdf) +{ + return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } +#endif -#ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); -#else static inline int page_to_nid(const struct page *page) { - return (page->flags >> NODES_PGSHIFT) & NODES_MASK; + return memdesc_nid(PF_POISONED_CHECK(page)->flags); +} + +static inline int folio_nid(const struct folio *folio) +{ + return memdesc_nid(folio->flags); } -#endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) +/* page access time bits needs to hold at least 4 seconds */ +#define PAGE_ACCESS_TIME_MIN_BITS 12 +#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS +#define PAGE_ACCESS_TIME_BUCKETS \ + (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) +#else +#define PAGE_ACCESS_TIME_BUCKETS 0 +#endif + +#define PAGE_ACCESS_TIME_MASK \ + (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) + +static inline int cpu_pid_to_cpupid(int cpu, int pid) { - return xchg(&page->_last_nid, nid); + return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int page_nid_last(struct page *page) +static inline int cpupid_to_pid(int cpupid) { - return page->_last_nid; + return cpupid & LAST__PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int cpupid_to_cpu(int cpupid) { - page->_last_nid = -1; + return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } -#else -static inline int page_nid_last(struct page *page) + +static inline int cpupid_to_nid(int cpupid) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return cpu_to_node(cpupid_to_cpu(cpupid)); } -extern int page_nid_xchg_last(struct page *page, int nid); +static inline bool cpupid_pid_unset(int cpupid) +{ + return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); +} -static inline void page_nid_reset_last(struct page *page) +static inline bool cpupid_cpu_unset(int cpupid) { - int nid = (1 << LAST_NID_SHIFT) - 1; + return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); +} - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; +static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) +{ + return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); +} + +#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) +{ + return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); +} + +static inline int folio_last_cpupid(struct folio *folio) +{ + return folio->_last_cpupid; +} +static inline void page_cpupid_reset_last(struct page *page) +{ + page->_last_cpupid = -1 & LAST_CPUPID_MASK; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int folio_last_cpupid(struct folio *folio) +{ + return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; +} + +int folio_xchg_last_cpupid(struct folio *folio, int cpupid); + +static inline void page_cpupid_reset_last(struct page *page) { - return page_to_nid(page); + page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } +#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ -static inline int page_nid_last(struct page *page) +static inline int folio_xchg_access_time(struct folio *folio, int time) { - return page_to_nid(page); + int last_time; + + last_time = folio_xchg_last_cpupid(folio, + time >> PAGE_ACCESS_TIME_BUCKETS); + return last_time << PAGE_ACCESS_TIME_BUCKETS; } -static inline void page_nid_reset_last(struct page *page) +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { + unsigned int pid_bit; + + pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { + __set_bit(pid_bit, &vma->numab_state->pids_active[1]); + } +} + +bool folio_use_access_time(struct folio *folio); +#else /* !CONFIG_NUMA_BALANCING */ +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) +{ + return folio_nid(folio); /* XXX */ +} + +static inline int folio_xchg_access_time(struct folio *folio, int time) +{ + return 0; } -#endif + +static inline int folio_last_cpupid(struct folio *folio) +{ + return folio_nid(folio); /* XXX */ +} + +static inline int cpupid_to_nid(int cpupid) +{ + return -1; +} + +static inline int cpupid_to_pid(int cpupid) +{ + return -1; +} + +static inline int cpupid_to_cpu(int cpupid) +{ + return -1; +} + +static inline int cpu_pid_to_cpupid(int nid, int pid) +{ + return -1; +} + +static inline bool cpupid_pid_unset(int cpupid) +{ + return true; +} + +static inline void page_cpupid_reset_last(struct page *page) +{ +} + +static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) +{ + return false; +} + +static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) +{ +} +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +/* + * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid + * setting tags for all pages to native kernel tag value 0xff, as the default + * value 0x00 maps to 0xff. + */ + +static inline u8 page_kasan_tag(const struct page *page) +{ + u8 tag = KASAN_TAG_KERNEL; + + if (kasan_enabled()) { + tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag ^= 0xff; + } + + return tag; +} + +static inline void page_kasan_tag_set(struct page *page, u8 tag) +{ + unsigned long old_flags, flags; + + if (!kasan_enabled()) + return; + + tag ^= 0xff; + old_flags = READ_ONCE(page->flags.f); + do { + flags = old_flags; + flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); +} + +static inline void page_kasan_tag_reset(struct page *page) +{ + if (kasan_enabled()) + page_kasan_tag_set(page, KASAN_TAG_KERNEL); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline u8 page_kasan_tag(const struct page *page) +{ + return 0xff; +} + +static inline void page_kasan_tag_set(struct page *page, u8 tag) { } +static inline void page_kasan_tag_reset(struct page *page) { } + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } +static inline pg_data_t *page_pgdat(const struct page *page) +{ + return NODE_DATA(page_to_nid(page)); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return NODE_DATA(folio_nid(folio)); +} + +static inline struct zone *folio_zone(const struct folio *folio) +{ + return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; + page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; +} + +static inline unsigned long memdesc_section(memdesc_flags_t mdf) +{ + return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; +} +#else /* !SECTION_IN_PAGE_FLAGS */ +static inline unsigned long memdesc_section(memdesc_flags_t mdf) +{ + return 0; +} +#endif /* SECTION_IN_PAGE_FLAGS */ + +/** + * folio_pfn - Return the Page Frame Number of a folio. + * @folio: The folio. + * + * A folio may contain multiple pages. The pages have consecutive + * Page Frame Numbers. + * + * Return: The Page Frame Number of the first page in the folio. + */ +static inline unsigned long folio_pfn(const struct folio *folio) +{ + return page_to_pfn(&folio->page); +} + +static inline struct folio *pfn_folio(unsigned long pfn) +{ + return page_folio(pfn_to_page(pfn)); +} + +#ifdef CONFIG_MMU +static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) +{ + return pfn_pte(page_to_pfn(page), pgprot); +} + +/** + * folio_mk_pte - Create a PTE for this folio + * @folio: The folio to create a PTE for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_ptes(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) +{ + return pfn_pte(folio_pfn(folio), pgprot); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * folio_mk_pmd - Create a PMD for this folio + * @folio: The folio to create a PMD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pmd_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) +{ + return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +/** + * folio_mk_pud - Create a PUD for this folio + * @folio: The folio to create a PUD for + * @pgprot: The page protection bits to use + * + * Create a page table entry for the first page of this folio. + * This is suitable for passing to set_pud_at(). + * + * Return: A page table entry suitable for mapping this folio. + */ +static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) +{ + return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_MMU */ + +static inline bool folio_has_pincount(const struct folio *folio) +{ + if (IS_ENABLED(CONFIG_64BIT)) + return folio_test_large(folio); + return folio_order(folio) > 1; } -static inline unsigned long page_to_section(const struct page *page) +/** + * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. + * @folio: The folio. + * + * This function checks if a folio has been pinned via a call to + * a function in the pin_user_pages() family. + * + * For small folios, the return value is partially fuzzy: false is not fuzzy, + * because it means "definitely not pinned for DMA", but true means "probably + * pinned for DMA, but possibly a false positive due to having at least + * GUP_PIN_COUNTING_BIAS worth of normal folio references". + * + * False positives are OK, because: a) it's unlikely for a folio to + * get that many refcounts, and b) all the callers of this routine are + * expected to be able to deal gracefully with a false positive. + * + * For most large folios, the result will be exactly correct. That's because + * we have more tracking data available: the _pincount field is used + * instead of the GUP_PIN_COUNTING_BIAS scheme. + * + * For more information, please see Documentation/core-api/pin_user_pages.rst. + * + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. + */ +static inline bool folio_maybe_dma_pinned(struct folio *folio) +{ + if (folio_has_pincount(folio)) + return atomic_read(&folio->_pincount) > 0; + + /* + * folio_ref_count() is signed. If that refcount overflows, then + * folio_ref_count() returns a negative value, and callers will avoid + * further incrementing the refcount. + * + * Here, for that overflow case, use the sign bit to count a little + * bit higher via unsigned math, and thus still get an accurate result. + */ + return ((unsigned int)folio_ref_count(folio)) >= + GUP_PIN_COUNTING_BIAS; +} + +/* + * This should most likely only be called during fork() to see whether we + * should break the cow immediately for an anon page on the src mm. + * + * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. + */ +static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, + struct folio *folio) { - return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); + + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) + return false; + + return folio_maybe_dma_pinned(folio); +} + +/** + * is_zero_page - Query if a page is a zero page + * @page: The page to query + * + * This returns true if @page is one of the permanent zero pages. + */ +static inline bool is_zero_page(const struct page *page) +{ + return is_zero_pfn(page_to_pfn(page)); +} + +/** + * is_zero_folio - Query if a folio is a zero page + * @folio: The folio to query + * + * This returns true if @folio is one of the permanent zero pages. + */ +static inline bool is_zero_folio(const struct folio *folio) +{ + return is_zero_page(&folio->page); +} + +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ +#ifdef CONFIG_MIGRATION +static inline bool folio_is_longterm_pinnable(struct folio *folio) +{ +#ifdef CONFIG_CMA + int mt = folio_migratetype(folio); + + if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) + return false; +#endif + /* The zero page can be "pinned" but gets special handling. */ + if (is_zero_folio(folio)) + return true; + + /* Coherent device memory must always allow eviction. */ + if (folio_is_device_coherent(folio)) + return false; + + /* + * Filesystems can only tolerate transient delays to truncate and + * hole-punch operations + */ + if (folio_is_fsdax(folio)) + return false; + + /* Otherwise, non-movable zone folios can be pinned. */ + return !folio_is_zone_movable(folio); + +} +#else +static inline bool folio_is_longterm_pinnable(struct folio *folio) +{ + return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { - page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); - page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { - page->flags &= ~(NODES_MASK << NODES_PGSHIFT); - page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; + page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, @@ -754,393 +2248,657 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -/* - * Some inline functions in vmstat.h depend on page_zone() +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A positive power of two. */ -#include <linux/vmstat.h> - -static __always_inline void *lowmem_page_address(const struct page *page) +static inline unsigned long folio_nr_pages(const struct folio *folio) { - return __va(PFN_PHYS(page_to_pfn(page))); + if (!folio_test_large(folio)) + return 1; + return folio_large_nr_pages(folio); } -#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) -#define HASHED_PAGE_VIRTUAL +#if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#elif defined(CONFIG_HUGETLB_PAGE) +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. + */ +#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) +#else +/* + * Without hugetlb, gigantic folios that are bigger than a single PUD are + * currently impossible. + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif -#if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) -#endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) -#if defined(HASHED_PAGE_VIRTUAL) -void *page_address(const struct page *page); -void set_page_address(struct page *page, void *virtual); -void page_address_init(void); -#endif +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline unsigned long compound_nr(const struct page *page) +{ + const struct folio *folio = (struct folio *)page; -#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) -#define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) -#define page_address_init() do { } while(0) -#endif + if (!test_bit(PG_head, &folio->flags.f)) + return 1; + return folio_large_nr_pages(folio); +} -/* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; - * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. - * - * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; - * and then page->mapping points, not to an anon_vma, but to a private - * structure which KSM associates with that merged page. See ksm.h. +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. * - * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. */ -#define PAGE_MAPPING_ANON 1 -#define PAGE_MAPPING_KSM 2 -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) - -extern struct address_space *page_mapping(struct page *page); - -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -static inline void *page_rmapping(struct page *page) +static inline struct folio *folio_next(struct folio *folio) { - return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); + return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } -extern struct address_space *__page_file_mapping(struct page *); - -static inline -struct address_space *page_file_mapping(struct page *page) +/** + * folio_shift - The size of the memory described by this folio. + * @folio: The folio. + * + * A folio represents a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. See also + * folio_size() and folio_order(). + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(const struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return __page_file_mapping(page); - - return page->mapping; + return PAGE_SHIFT + folio_order(folio); } -static inline int PageAnon(struct page *page) +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. + */ +static inline size_t folio_size(const struct folio *folio) { - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return PAGE_SIZE << folio_order(folio); } -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use ->private +/** + * folio_maybe_mapped_shared - Whether the folio is mapped into the page + * tables of more than one MM + * @folio: The folio. + * + * This function checks if the folio maybe currently mapped into more than one + * MM ("maybe mapped shared"), or if the folio is certainly mapped into a single + * MM ("mapped exclusively"). + * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped shared" (false positive) if a folio was mapped by + * more than two MMs at one point in time. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline pgoff_t page_index(struct page *page) +static inline bool folio_maybe_mapped_shared(struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return page_private(page); - return page->index; -} + int mapcount = folio_mapcount(folio); -extern pgoff_t __page_file_index(struct page *page); + /* Only partially-mappable folios require more care. */ + if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) + return mapcount > 1; -/* - * Return the file index of the page. Regular pagecache pages use ->index - * whereas swapcache pages use swp_offset(->private) + /* + * vm_insert_page() without CONFIG_TRANSPARENT_HUGEPAGE ... + * simply assume "mapped shared", nobody should really care + * about this for arbitrary kernel allocations. + */ + if (!IS_ENABLED(CONFIG_MM_ID)) + return true; + + /* + * A single mapping implies "mapped exclusively", even if the + * folio flag says something different: it's easier to handle this + * case here instead of on the RMAP hot path. + */ + if (mapcount <= 1) + return false; + return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids); +} + +/** + * folio_expected_ref_count - calculate the expected folio refcount + * @folio: the folio + * + * Calculate the expected folio refcount, taking references from the pagecache, + * swapcache, PG_private and page table mappings into account. Useful in + * combination with folio_ref_count() to detect unexpected references (e.g., + * GUP or other temporary references). + * + * Does currently not consider references from the LRU cache. If the folio + * was isolated from the LRU (which is the case during migration or split), + * the LRU cache does not apply. + * + * Calling this function on an unmapped folio -- !folio_mapped() -- that is + * locked will return a stable result. + * + * Calling this function on a mapped folio will not result in a stable result, + * because nothing stops additional page table mappings from coming (e.g., + * fork()) or going (e.g., munmap()). + * + * Calling this function without the folio lock will also not result in a + * stable result: for example, the folio might get dropped from the swapcache + * concurrently. + * + * However, even when called without the folio lock or on a mapped folio, + * this function can be used to detect unexpected references early (for example, + * if it makes sense to even lock the folio and unmap it). + * + * The caller must add any reference (e.g., from folio_try_get()) it might be + * holding itself to the result. + * + * Returns the expected folio refcount. */ -static inline pgoff_t page_file_index(struct page *page) +static inline int folio_expected_ref_count(const struct folio *folio) { - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); + const int order = folio_order(folio); + int ref_count = 0; + + if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) + return 0; - return page->index; + if (folio_test_anon(folio)) { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + } else { + /* One reference per page from the pagecache. */ + ref_count += !!folio->mapping << order; + /* One reference from PG_private. */ + ref_count += folio_test_private(folio); + } + + /* One reference per page table mapping. */ + return ref_count + folio_mapcount(folio); } -/* - * Return true if this page is mapped into pagetables. - */ -static inline int page_mapped(struct page *page) +#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE +static inline int arch_make_folio_accessible(struct folio *folio) { - return atomic_read(&(page)->_mapcount) >= 0; + return 0; } +#endif /* - * Different kinds of faults, as returned by handle_mm_fault(). - * Used to decide whether a process gets delivered SIGBUS or - * just gets major/minor fault counters bumped up. + * Some inline functions in vmstat.h depend on page_zone() */ +#include <linux/vmstat.h> -#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif -#define VM_FAULT_OOM 0x0001 -#define VM_FAULT_SIGBUS 0x0002 -#define VM_FAULT_MAJOR 0x0004 -#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ -#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ +#if defined(WANT_PAGE_VIRTUAL) +static inline void *page_address(const struct page *page) +{ + return page->virtual; +} +static inline void set_page_address(struct page *page, void *address) +{ + page->virtual = address; +} +#define page_address_init() do { } while(0) +#endif -#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ -#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(const struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif -#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ - VM_FAULT_HWPOISON_LARGE) +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif -/* Encode hstate index for a hwpoisoned large page */ -#define VM_FAULT_SET_HINDEX(x) ((x) << 12) -#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) +static inline void *folio_address(const struct folio *folio) +{ + return page_address(&folio->page); +} /* - * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. + * Return true only if the page has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. */ -extern void pagefault_out_of_memory(void); +static inline bool page_is_pfmemalloc(const struct page *page) +{ + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)page->lru.next & BIT(1); +} -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +/* + * Return true only if the folio has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool folio_is_pfmemalloc(const struct folio *folio) +{ + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)folio->lru.next & BIT(1); +} /* - * Flags passed to show_mem() and show_free_areas() to suppress output in - * various contexts. + * Only to be called by the page allocator on a freshly allocated + * page. */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -#define SHOW_MEM_FILTER_PAGE_COUNT (0x0002u) /* page type count */ +static inline void set_page_pfmemalloc(struct page *page) +{ + page->lru.next = (void *)BIT(1); +} -extern void show_free_areas(unsigned int flags); -extern bool skip_free_areas_node(unsigned int flags, int nid); +static inline void clear_page_pfmemalloc(struct page *page) +{ + page->lru.next = NULL; +} -int shmem_zero_setup(struct vm_area_struct *); +/* + * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. + */ +extern void pagefault_out_of_memory(void); -extern int can_do_mlock(void); -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ + struct folio *single_folio; /* Locked folio to be unmapped */ + bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ + zap_flags_t zap_flags; /* Extra flags for zapping */ }; -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte); +/* + * Whether to drop the pte markers, for example, the uffd-wp information for + * file-backed memory. This should only be specified when we will completely + * drop the page in the mm, either by truncation or unmapping of the vma. By + * default, the flag is not set. + */ +#define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) +/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ +#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, - unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +#ifdef CONFIG_MMU +extern bool can_do_mlock(void); +#else +static inline bool can_do_mlock(void) { return false; } +#endif +extern int user_shm_lock(size_t, struct ucounts *); +extern void user_shm_unlock(size_t, struct ucounts *); -/** - * mm_walk - callbacks for walk_page_range - * @pgd_entry: if set, called for each non-empty PGD (top-level) entry - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry - * is used. - * - * (see walk_page_range for more details) - */ -struct mm_walk { - int (*pgd_entry)(pgd_t *pgd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_entry)(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, - unsigned long addr, unsigned long next, - struct mm_walk *walk); - struct mm_struct *mm; - void *private; -}; +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); +struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t pud); + +void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size); +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ + zap_page_range_single(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); +} +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end, unsigned long tree_end); + +struct mmu_notifier_range; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @addr_mask: address mask covering pfn + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + unsigned long addr_mask; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); -extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); +extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int truncate_inode_page(struct address_space *mapping, struct page *page); -int generic_error_remove_page(struct address_space *mapping, struct page *page); -int invalidate_inode_page(struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); + +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU -extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags); -extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags); +extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + struct pt_regs *regs); +extern int fixup_user_fault(struct mm_struct *mm, + unsigned long address, unsigned int fault_flags, + bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else -static inline int handle_mm_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } -static inline int fixup_user_fault(struct task_struct *tsk, - struct mm_struct *mm, unsigned long address, - unsigned int fault_flags) +static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, + unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, int write); - -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking); -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - struct vm_area_struct **vmas); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); -struct kvec; -int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, - struct page **pages); -int get_kernel_page(unsigned long start, int write, struct page **pages); -struct page *get_dump_page(unsigned long addr); - -extern int try_to_release_page(struct page * page, gfp_t gfp_mask); -extern void do_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); - -int __set_page_dirty_nobuffers(struct page *page); -int __set_page_dirty_no_writeback(struct page *page); -int redirty_page_for_writepage(struct writeback_control *wbc, - struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_writeback(struct page *page); -int set_page_dirty(struct page *page); -int set_page_dirty_lock(struct page *page); -int clear_page_dirty_for_io(struct page *page); - -/* Is the vma a continuation of the stack vma above it? */ -static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr) +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) { - return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); + unmap_mapping_range(mapping, holebegin, holelen, 0); } -static inline int stack_guard_page_start(struct vm_area_struct *vma, - unsigned long addr) -{ - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_growsdown(vma->vm_prev, addr); -} +static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, + unsigned long addr); -/* Is the vma a continuation of the stack vma below it? */ -static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr) -{ - return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP); -} +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); -static inline int stack_guard_page_end(struct vm_area_struct *vma, - unsigned long addr) +#ifdef CONFIG_BPF_SYSCALL +extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +#endif + +long get_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked); +long pin_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked); + +/* + * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. + */ +static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, + unsigned long addr, + int gup_flags, + struct vm_area_struct **vmap) { - return (vma->vm_flags & VM_GROWSUP) && - (vma->vm_end == addr) && - !vma_growsup(vma->vm_next, addr); + struct page *page; + struct vm_area_struct *vma; + int got; + + if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) + return ERR_PTR(-EINVAL); + + got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + + if (got < 0) + return ERR_PTR(got); + + vma = vma_lookup(mm, addr); + if (WARN_ON_ONCE(!vma)) { + put_page(page); + return ERR_PTR(-EINVAL); + } + + *vmap = vma; + return page; } -extern pid_t -vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages); +long pin_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages); +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); +int folio_add_pins(struct folio *folio, unsigned int pins); + +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); +void folio_add_pin(struct folio *folio); + +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + const struct task_struct *task, bool bypass_rlim); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks); -extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa); -extern int mprotect_fixup(struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +struct kvec; +struct page *get_dump_page(unsigned long addr, int *locked); + +bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); +bool set_page_dirty(struct page *page); +int set_page_dirty_lock(struct page *page); + +int get_cmdline(struct task_struct *task, char *buffer, int buflen); + +/* + * Flags used by change_protection(). For now we make it a bitmap so + * that we can pass in multiple flags just like parameters. However + * for now all the callers are only use one of the flags at the same + * time. + */ +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. + */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) +/* Whether this protection change is for NUMA hints */ +#define MM_CP_PROT_NUMA (1UL << 1) +/* Whether this change is for write protecting */ +#define MM_CP_UFFD_WP (1UL << 2) /* do wp */ +#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ +#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ + MM_CP_UFFD_WP_RESOLVE) + +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); +extern long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long cp_flags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, vm_flags_t newflags); /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); + return percpu_counter_read_positive(&mm->rss_stat[member]); +} -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; +static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member) +{ + return percpu_counter_sum_positive(&mm->rss_stat[member]); } +void mm_trace_rss_stat(struct mm_struct *mm, int member); + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - atomic_long_add(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); + + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - atomic_long_inc(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); + + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - atomic_long_dec(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); + + mm_trace_rss_stat(mm, member); +} + +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) +{ + if (folio_test_swapbacked(folio)) + return MM_SHMEMPAGES; + return MM_FILEPAGES; +} + +static inline int mm_counter(struct folio *folio) +{ + if (folio_test_anon(folio)) + return MM_ANONPAGES; + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + - get_mm_counter(mm, MM_ANONPAGES); + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) @@ -1157,8 +2915,8 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) - (mm)->hiwater_rss = _rss; + if (data_race(mm->hiwater_rss) < _rss) + data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm) @@ -1167,6 +2925,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm) mm->hiwater_vm = mm->total_vm; } +static inline void reset_mm_hiwater_rss(struct mm_struct *mm) +{ + mm->hiwater_rss = get_mm_rss(mm); +} + static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { @@ -1176,15 +2939,41 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } -#if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct mm_struct *mm); -#else -static inline void sync_mm_rss(struct mm_struct *mm) +#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL +static inline int pte_special(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_mkspecial(pte_t pte) { + return pte; } #endif -int vma_wants_writenotify(struct vm_area_struct *vma); +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); @@ -1196,39 +2985,120 @@ static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, return ptep; } -#ifdef __PAGETABLE_PUD_FOLDED -static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, +#ifdef __PAGETABLE_P4D_FOLDED +static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else -int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PMD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) +static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, + unsigned long address) +{ + return 0; +} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + +#else +int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + if (mm_pud_folded(mm)) + return; + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + if (mm_pud_folded(mm)) + return; + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); +} +#endif + +#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) +{ + if (mm_pmd_folded(mm)) + return; + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + if (mm_pmd_folded(mm)) + return; + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); +} #endif -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address); -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); +#ifdef CONFIG_MMU +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->pgtables_bytes, 0); +} -/* - * The following ifdef needed to get the 4level-fixup.h header to work. - * Remove it when 4level-fixup.h has been removed. - */ -#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) -static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? - NULL: pud_offset(pgd, address); + return atomic_long_read(&mm->pgtables_bytes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); +} +#else + +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); +int __pte_alloc_kernel(pmd_t *pmd); + +#if defined(CONFIG_MMU) + +static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, + unsigned long address) +{ + return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? + NULL : p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, + unsigned long address) +{ + return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? + NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) @@ -1236,73 +3106,380 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } -#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#endif /* CONFIG_MMU */ -#if USE_SPLIT_PTLOCKS -/* - * We tuck a spinlock to guard each pagetable page into its struct page, - * at page->private, with BUILD_BUG_ON to make sure that this will not - * overflow into the next struct page (as it might with DEBUG_SPINLOCK). - * When freeing, reset page->mapping so free_pages_check won't complain. +enum pt_flags { + PT_kernel = PG_referenced, + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ +}; + +static inline struct ptdesc *virt_to_ptdesc(const void *x) +{ + return page_ptdesc(virt_to_page(x)); +} + +/** + * ptdesc_address - Virtual address of page table. + * @pt: Page table descriptor. + * + * Return: The first byte of the page table described by @pt. */ -#define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) -#define pte_lock_deinit(page) ((page)->mapping = NULL) -#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else /* !USE_SPLIT_PTLOCKS */ +static inline void *ptdesc_address(const struct ptdesc *pt) +{ + return folio_address(ptdesc_folio(pt)); +} + +static inline bool pagetable_is_reserved(struct ptdesc *pt) +{ + return test_bit(PT_reserved, &pt->pt_flags.f); +} + +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. + */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order + * + * pagetable_alloc allocates memory for page tables as well as a page table + * descriptor to describe that memory. + * + * Return: The ptdesc describing the allocated page tables. + */ +static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} +#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) + +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif +/** + * pagetable_free - Free pagetables + * @pt: The page table descriptor + * + * pagetable_free frees the memory of all page tables described by a page + * table descriptor and the memory for the descriptor itself. + */ +static inline void pagetable_free(struct ptdesc *pt) +{ + if (ptdesc_test_kernel(pt)) { + ptdesc_clear_kernel(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } +} + +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) +#if ALLOC_SPLIT_PTLOCKS +void __init ptlock_cache_init(void); +bool ptlock_alloc(struct ptdesc *ptdesc); +void ptlock_free(struct ptdesc *ptdesc); + +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) +{ + return ptdesc->ptl; +} +#else /* ALLOC_SPLIT_PTLOCKS */ +static inline void ptlock_cache_init(void) +{ +} + +static inline bool ptlock_alloc(struct ptdesc *ptdesc) +{ + return true; +} + +static inline void ptlock_free(struct ptdesc *ptdesc) +{ +} + +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) +{ + return &ptdesc->ptl; +} +#endif /* ALLOC_SPLIT_PTLOCKS */ + +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); +} + +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); +} + +static inline bool ptlock_init(struct ptdesc *ptdesc) +{ + /* + * prep_new_page() initialize page->private (and therefore page->ptl) + * with 0. Make sure nobody took it in use in between. + * + * It can happen if arch try to use slab for page table allocation: + * slab code uses page->slab_cache, which share storage with page->ptl. + */ + VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); + if (!ptlock_alloc(ptdesc)) + return false; + spin_lock_init(ptlock_ptr(ptdesc)); + return true; +} + +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) -#define pte_lock_deinit(page) do {} while (0) -#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* USE_SPLIT_PTLOCKS */ +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} +static inline void ptlock_cache_init(void) {} +static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } +static inline void ptlock_free(struct ptdesc *ptdesc) {} +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ + +static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) +{ + return compound_nr(ptdesc_page(ptdesc)); +} -static inline void pgtable_page_ctor(struct page *page) +static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + + __SetPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } -static inline void pgtable_page_dtor(struct page *page) +static inline void pagetable_dtor(struct ptdesc *ptdesc) { - pte_lock_deinit(page); - dec_zone_page_state(page, NR_PAGETABLE); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); + + ptlock_free(ptdesc); + __ClearPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } -#define pte_offset_map_lock(mm, pmd, address, ptlp) \ -({ \ - spinlock_t *__ptl = pte_lockptr(mm, pmd); \ - pte_t *__pte = pte_offset_map(pmd, address); \ - *(ptlp) = __ptl; \ - spin_lock(__ptl); \ - __pte; \ -}) +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +static inline bool pagetable_pte_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) +{ + if (mm != &init_mm && !ptlock_init(ptdesc)) + return false; + __pagetable_ctor(ptdesc); + return true; +} + +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} +static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) +{ + return __pte_offset_map(pmd, addr, NULL); +} + +pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pte_t *pte; + + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); + return pte; +} + +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, vma, pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ - pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) + +#define pte_alloc_map(mm, pmd, address) \ + (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ - pmd, address))? \ - NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + (pte_alloc(mm, pmd) ? \ + NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) -extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) + +static inline struct page *pmd_pgtable_page(pmd_t *pmd) +{ + unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); + return virt_to_page((void *)((unsigned long) pmd & mask)); +} + +static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) +{ + return page_ptdesc(pmd_pgtable_page(pmd)); +} + +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return ptlock_ptr(pmd_ptdesc(pmd)); +} + +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptdesc->pmd_huge_pte = NULL; +#endif + return ptlock_init(ptdesc); +} + +#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) + +#else + +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} + +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } + +#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) + +#endif + +static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + return ptl; +} + +static inline bool pagetable_pmd_ctor(struct mm_struct *mm, + struct ptdesc *ptdesc) +{ + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) + return false; + ptdesc_pmd_pts_init(ptdesc); + __pagetable_ctor(ptdesc); + return true; +} + +/* + * No scalability reason to split PUD locks yet, but follow the same pattern + * as the PMD locks to make it easier if we decide to. The VM should not be + * considered ready to switch to split PUD locks yet; there may be places + * which need to be converted from page_table_lock. + */ +static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) +{ + return &mm->page_table_lock; +} + +static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) +{ + spinlock_t *ptl = pud_lockptr(mm, pud); + + spin_lock(ptl); + return ptl; +} + +static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + +extern void __init pagecache_init(void); extern void free_initmem(void); /* @@ -1312,32 +3489,15 @@ extern void free_initmem(void); * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, - int poison, char *s); - -#ifdef CONFIG_HIGHMEM -/* - * Free a highmem page into the buddy system, adjusting totalhigh_pages - * and totalram_pages. - */ -extern void free_highmem_page(struct page *page); -#endif + int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); -extern void mem_init_print_info(const char *str); -/* Free the reserved page into the buddy system, so it gets managed. */ -static inline void __free_reserved_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); -} +extern void reserve_bootmem_region(phys_addr_t start, + phys_addr_t end, int nid); -static inline void free_reserved_page(struct page *page) -{ - __free_reserved_page(page); - adjust_managed_page_count(page, 1); -} +/* Free the reserved page into the buddy system, so it gets managed. */ +void free_reserved_page(struct page *page); static inline void mark_page_reserved(struct page *page) { @@ -1345,6 +3505,11 @@ static inline void mark_page_reserved(struct page *page) adjust_managed_page_count(page, -1); } +static inline void free_reserved_ptdesc(struct ptdesc *pt) +{ + free_reserved_page(ptdesc_page(pt)); +} + /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within @@ -1356,7 +3521,7 @@ static inline unsigned long free_initmem_default(int poison) extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, - poison, "unused kernel"); + poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) @@ -1370,98 +3535,71 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * Using memblock node mappings, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); - * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly - * sparse_memory_present_with_active_regions() calls memory_present() for - * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. + * memblock_add_node(base, size, nid, MEMBLOCK_NONE) + * free_area_init(max_zone_pfns); */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); -unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); -extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn) +#ifndef CONFIG_NUMA +static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); -#ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -/* there is a per-arch backend function. */ -extern int __meminit __early_pfn_to_nid(unsigned long pfn); -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ #endif -extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, - unsigned long, enum memmap_context); -extern void setup_per_zone_wmarks(void); -extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(void) +{ + __show_mem(0, NULL, MAX_NR_ZONES - 1); +} +extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -extern void zone_pcp_update(struct zone *zone); -extern void zone_pcp_reset(struct zone *zone); - -/* page_alloc.c */ -extern int min_free_kbytes; - /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, - struct rb_root *root); + struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, - struct rb_root *root); -struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root *root, + struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, + unsigned long start, unsigned long last); +struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); @@ -1470,18 +3608,13 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, - struct rb_root *root); + struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, - struct rb_root *root); -struct anon_vma_chain *anon_vma_interval_tree_iter_first( - struct rb_root *root, unsigned long start, unsigned long last); + struct rb_root_cached *root); +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, + unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB @@ -1493,44 +3626,71 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ -extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); + +static inline int check_data_rlimit(unsigned long rlim, + unsigned long new, + unsigned long start, + unsigned long end_data, + unsigned long start_data) +{ + if (rlim < RLIM_INFINITY) { + if (((new - start) + (end_data - start_data)) > rlim) + return -ENOSPC; + } + + return 0; +} extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); +extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); +extern struct file *get_task_exe_file(struct task_struct *task); -extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); -extern int install_special_mapping(struct mm_struct *mm, +extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); +extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); + +extern bool vma_is_special_mapping(const struct vm_area_struct *vma, + const struct vm_special_mapping *sm); +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); + vm_flags_t vm_flags, + const struct vm_special_mapping *spec); + +unsigned long randomize_stack_top(unsigned long stack_top); +unsigned long randomize_page(unsigned long start, unsigned long range); + +unsigned long +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +static inline unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return __get_unmapped_area(file, addr, len, pgoff, flags, 0); +} -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate); -extern int do_munmap(struct mm_struct *, unsigned long, size_t); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); +extern int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf); +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, @@ -1544,10 +3704,10 @@ static inline void mm_populate(unsigned long addr, unsigned long len) static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif -/* These take the mm semaphore themselves */ -extern unsigned long vm_brk(unsigned long, unsigned long); +/* This takes the mm semaphore itself */ +extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); -extern unsigned long vm_mmap(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -1559,105 +3719,182 @@ struct vm_unmapped_area_info { unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; + unsigned long start_gap; }; -extern unsigned long unmapped_area(struct vm_unmapped_area_info *info); -extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); - -/* - * Search for an unmapped address range. - * - * We are looking for a range that: - * - does not intersect with any VMA; - * - is contained within the [low_limit, high_limit) interval; - * - is at least the desired size. - * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) - */ -static inline unsigned long -vm_unmapped_area(struct vm_unmapped_area_info *info) -{ - if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN)) - return unmapped_area(info); - else - return unmapped_area_topdown(info); -} +extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ -extern void truncate_inode_pages(struct address_space *, loff_t); -extern void truncate_inode_pages_range(struct address_space *, - loff_t lstart, loff_t lend); +void truncate_inode_pages(struct address_space *mapping, loff_t lstart); +void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, + uoff_t lend); +void truncate_inode_pages_final(struct address_space *mapping); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); - -/* mm/page-writeback.c */ -int write_one_page(struct page *page, int wait); -void task_dirty_inc(struct task_struct *tsk); - -/* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ -#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - -unsigned long max_sane_readahead(unsigned long nr); -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, - struct file *filp); +extern vm_fault_t filemap_fault(struct vm_fault *vmf); +extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff); +extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); +extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -extern int expand_stack(struct vm_area_struct *vma, unsigned long address); - -/* CONFIG_STACK_GROWSUP still needs to to grow downwards at some places */ -extern int expand_downwards(struct vm_area_struct *vma, - unsigned long address); -#if VM_GROWSUP -extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); -#else - #define expand_upwards(vma, address) do { } while (0) -#endif +int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); +struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, unsigned long end_addr); + +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. + */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + return mtree_load(&mm->mm_mt, addr); +} - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; +static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_GROWSDOWN) + return stack_guard_gap; + + /* See reasoning around the VM_SHADOW_STACK definition */ + if (vma->vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + +static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) +{ + unsigned long gap = stack_guard_start_gap(vma); + unsigned long vm_start = vma->vm_start; + + vm_start -= gap; + if (vm_start > vma->vm_start) + vm_start = 0; + return vm_start; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) +{ + unsigned long vm_end = vma->vm_end; + + if (vma->vm_flags & VM_GROWSUP) { + vm_end += stack_guard_gap; + if (vm_end < vma->vm_end) + vm_end = -PAGE_SIZE; + } + return vm_end; +} + +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + +/** + * mmap_action_remap - helper for mmap_prepare hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; @@ -1665,101 +3902,317 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +static inline bool range_in_vma(const struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + return (vma && vma->vm_start <= start && end <= vma->vm_end); +} + #ifdef CONFIG_MMU -pgprot_t vm_get_page_prot(unsigned long vm_flags); +pgprot_t vm_get_page_prot(vm_flags_t vm_flags); +void vma_set_page_prot(struct vm_area_struct *vma); #else -static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) +static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags) { return __pgprot(0); } +static inline void vma_set_page_prot(struct vm_area_struct *vma) +{ + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); +} #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +void vma_set_file(struct vm_area_struct *vma, struct file *file); + +#ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif -struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); +struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, + unsigned long addr); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write); +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); +static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + int err = vm_insert_page(vma, addr, page); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags, - unsigned int *page_mask); + return VM_FAULT_NOPAGE; +} -static inline struct page *follow_page(struct vm_area_struct *vma, - unsigned long address, unsigned int foll_flags) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - unsigned int unused_page_mask; - return follow_page_mask(vma, address, foll_flags, &unused_page_mask); + return pfn; } +#endif + +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, + unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_MLOCK 0x40 /* mark page as mlocked */ -#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ + return remap_pfn_range(vma, addr, pfn, size, prot); +} -typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, - void *data); +static inline vm_fault_t vmf_error(int err) +{ + if (err == -ENOMEM) + return VM_FAULT_OOM; + else if (err == -EHWPOISON) + return VM_FAULT_HWPOISON; + return VM_FAULT_SIGBUS; +} + +/* + * Convert errno to return value for ->page_mkwrite() calls. + * + * This should eventually be merged with vmf_error() above, but will need a + * careful audit of all vmf_error() callers. + */ +static inline vm_fault_t vmf_fs_error(int err) +{ + if (err == 0) + return VM_FAULT_LOCKED; + if (err == -EFAULT || err == -EAGAIN) + return VM_FAULT_NOPAGE; + if (err == -ENOMEM) + return VM_FAULT_OOM; + /* -ENOSPC, -EDQUOT, -EIO ... */ + return VM_FAULT_SIGBUS; +} + +static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) +{ + if (vm_fault & VM_FAULT_OOM) + return -ENOMEM; + if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; + if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) + return -EFAULT; + return 0; +} + +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, + unsigned int flags) +{ + /* + * If callers don't want to honor NUMA hinting faults, no need to + * determine if we would actually have to trigger a NUMA hinting fault. + */ + if (!(flags & FOLL_HONOR_NUMA_FAULT)) + return true; + + /* + * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. + * + * Requiring a fault here even for inaccessible VMAs would mean that + * FOLL_FORCE cannot make any progress, because handle_mm_fault() + * refuses to process NUMA hinting faults in inaccessible VMAs. + */ + return !vma_is_accessible(vma); +} + +typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); - -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +extern int apply_to_existing_page_range(struct mm_struct *mm, + unsigned long address, unsigned long size, + pte_fn_t fn, void *data); + +#ifdef CONFIG_PAGE_POISONING +extern void __kernel_poison_pages(struct page *page, int numpages); +extern void __kernel_unpoison_pages(struct page *page, int numpages); +extern bool _page_poisoning_enabled_early; +DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); +static inline bool page_poisoning_enabled(void) +{ + return _page_poisoning_enabled_early; +} +/* + * For use in fast paths after init_mem_debugging() has run, or when a + * false negative result is not harmful when called too early. + */ +static inline bool page_poisoning_enabled_static(void) +{ + return static_branch_unlikely(&_page_poisoning_enabled); +} +static inline void kernel_poison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, numpages); +} +static inline void kernel_unpoison_pages(struct page *page, int numpages) +{ + if (page_poisoning_enabled_static()) + __kernel_unpoison_pages(page, numpages); +} #else -static inline void vm_stat_account(struct mm_struct *mm, - unsigned long flags, struct file *file, long pages) +static inline bool page_poisoning_enabled(void) { return false; } +static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } +static inline void kernel_poison_pages(struct page *page, int numpages) { } +static inline void kernel_unpoison_pages(struct page *page, int numpages) { } +#endif + +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); +static inline bool want_init_on_alloc(gfp_t flags) +{ + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) + return true; + return flags & __GFP_ZERO; +} + +DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); +static inline bool want_init_on_free(void) { - mm->total_vm += pages; + return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free); } -#endif /* CONFIG_PROC_FS */ +extern bool _debug_pagealloc_enabled_early; +DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); + +static inline bool debug_pagealloc_enabled(void) +{ + return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && + _debug_pagealloc_enabled_early; +} + +/* + * For use in fast paths after mem_debugging_and_hardening_init() has run, + * or when a false negative result is not harmful when called too early. + */ +static inline bool debug_pagealloc_enabled_static(void) +{ + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + + return static_branch_unlikely(&_debug_pagealloc_enabled); +} + +/* + * To support DEBUG_PAGEALLOC architecture must ensure that + * __kernel_map_pages() never fails + */ +extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC -extern void kernel_map_pages(struct page *page, int numpages, int enable); -#ifdef CONFIG_HIBERNATION -extern bool kernel_page_present(struct page *page); -#endif /* CONFIG_HIBERNATION */ -#else -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} -#ifdef CONFIG_HIBERNATION -static inline bool kernel_page_present(struct page *page) { return true; } -#endif /* CONFIG_HIBERNATION */ -#endif +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 1); +} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) +{ + if (debug_pagealloc_enabled_static()) + __kernel_map_pages(page, numpages, 0); +} + +extern unsigned int _debug_guardpage_minorder; +DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool debug_guardpage_enabled(void) +{ + return static_branch_unlikely(&_debug_guardpage_enabled); +} + +static inline bool page_is_guard(const struct page *page) +{ + if (!debug_guardpage_enabled()) + return false; + + return PageGuard(page); +} + +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order) +{ + if (!debug_guardpage_enabled()) + return false; + return __set_page_guard(zone, page, order); +} + +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order) +{ + if (!debug_guardpage_enabled()) + return; + __clear_page_guard(zone, page, order); +} + +#else /* CONFIG_DEBUG_PAGEALLOC */ +static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} +static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool debug_guardpage_enabled(void) { return false; } +static inline bool page_is_guard(const struct page *page) { return false; } +static inline bool set_page_guard(struct zone *zone, struct page *page, + unsigned int order) { return false; } +static inline void clear_page_guard(struct zone *zone, struct page *page, + unsigned int order) {} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); -#ifdef __HAVE_ARCH_GATE_AREA -int in_gate_area_no_mm(unsigned long addr); -int in_gate_area(struct mm_struct *mm, unsigned long addr); +extern int in_gate_area_no_mm(unsigned long addr); +extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else -int in_gate_area_no_mm(unsigned long addr); -#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) +static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} +static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } +static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} #endif /* __HAVE_ARCH_GATE_AREA */ -#ifdef CONFIG_SYSCTL -extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages); +void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 @@ -1768,73 +4221,247 @@ extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); +#ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); +#else +static inline void print_vma_addr(char *prefix, unsigned long rip) +{ +} +#endif -void sparse_mem_maps_populate_node(struct page **map_map, - unsigned long pnum_begin, - unsigned long pnum_end, - unsigned long map_count, - int nodeid); - -struct page *sparse_mem_map_populate(unsigned long pnum, int nid); +void *sparse_buffer_alloc(unsigned long size); +unsigned long section_map_size(void); +struct page * __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); -pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); +p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); +pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); -pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); +pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, + struct vmem_altmap *altmap, unsigned long ptpfn, + unsigned long flags); void *vmemmap_alloc_block(unsigned long size, int node); -void *vmemmap_alloc_block_buf(unsigned long size, int node); +struct vmem_altmap; +void *vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, - int node); -int vmemmap_populate(unsigned long start, unsigned long end, int node); + int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); +int vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap); +int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); +void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node, + unsigned long headsize); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG -void vmemmap_free(unsigned long start, unsigned long end); +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap); +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + +#define VMEMMAP_RESERVE_NR 2 +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* + * For vmemmap optimization with DAX we need minimum 2 vmemmap + * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); +} +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + +#else +static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return false; +} #endif -void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, - unsigned long size); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, + MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, + MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; -extern int memory_failure(unsigned long pfn, int trapno, int flags); -extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); +extern int memory_failure(unsigned long pfn, int flags); extern int unpoison_memory(unsigned long pfn); -extern int sysctl_memory_failure_early_kill; -extern int sysctl_memory_failure_recovery; -extern void shake_page(struct page *p, int access); -extern atomic_long_t num_poisoned_pages; -extern int soft_offline_page(struct page *page, int flags); +extern atomic_long_t num_poisoned_pages __read_mostly; +extern int soft_offline_page(unsigned long pfn, int flags); +#ifdef CONFIG_MEMORY_FAILURE +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; +extern void memory_failure_queue(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); +#else +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} -extern void dump_page(struct page *page); +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) +{ + return 0; +} -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr, - unsigned int pages_per_huge_page); -extern void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma, - unsigned int pages_per_huge_page); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +static inline void num_poisoned_pages_inc(unsigned long pfn) +{ +} -#ifdef CONFIG_DEBUG_PAGEALLOC -extern unsigned int _debug_guardpage_minorder; +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif -static inline unsigned int debug_guardpage_minorder(void) +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) { - return _debug_guardpage_minorder; } -static inline bool page_is_guard(struct page *page) +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { - return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); } -#else -static inline unsigned int debug_guardpage_minorder(void) { return 0; } -static inline bool page_is_guard(struct page *page) { return false; } -#endif /* CONFIG_DEBUG_PAGEALLOC */ +#endif + +#ifndef arch_memory_failure +static inline int arch_memory_failure(unsigned long pfn, int flags) +{ + return -ENXIO; +} +#endif + +#ifndef arch_is_platform_page +static inline bool arch_is_platform_page(u64 paddr) +{ + return false; +} +#endif + +/* + * Error handlers for various types of pages. + */ +enum mf_result { + MF_IGNORED, /* Error: cannot be handled */ + MF_FAILED, /* Error: handling failed */ + MF_DELAYED, /* Will be handled later */ + MF_RECOVERED, /* Successfully recovered */ +}; + +enum mf_action_page_type { + MF_MSG_KERNEL, + MF_MSG_KERNEL_HIGH_ORDER, + MF_MSG_DIFFERENT_COMPOUND, + MF_MSG_HUGE, + MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, + MF_MSG_UNMAP_FAILED, + MF_MSG_DIRTY_SWAPCACHE, + MF_MSG_CLEAN_SWAPCACHE, + MF_MSG_DIRTY_MLOCKED_LRU, + MF_MSG_CLEAN_MLOCKED_LRU, + MF_MSG_DIRTY_UNEVICTABLE_LRU, + MF_MSG_CLEAN_UNEVICTABLE_LRU, + MF_MSG_DIRTY_LRU, + MF_MSG_CLEAN_LRU, + MF_MSG_TRUNCATED_LRU, + MF_MSG_BUDDY, + MF_MSG_DAX, + MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, + MF_MSG_UNKNOWN, +}; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +void folio_zero_user(struct folio *folio, unsigned long addr_hint); +int copy_user_large_folio(struct folio *dst, struct folio *src, + unsigned long addr_hint, + struct vm_area_struct *vma); +long copy_folio_from_user(struct folio *dst_folio, + const void __user *usr_src, + bool allow_pagefault); + +/** + * vma_is_special_huge - Are transhuge page-table entries considered special? + * @vma: Pointer to the struct vm_area_struct to consider + * + * Whether transhuge page-table entries are considered "special" following + * the definition in vm_normal_page(). + * + * Return: true if transhuge page-table entries should be considered special, + * false otherwise. + */ +static inline bool vma_is_special_huge(const struct vm_area_struct *vma) +{ + return vma_is_dax(vma) || (vma->vm_file && + (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); +} + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); @@ -1842,5 +4469,177 @@ void __init setup_nr_node_ids(void); static inline void setup_nr_node_ids(void) {} #endif -#endif /* __KERNEL__ */ +extern int memcmp_pages(struct page *page1, struct page *page2); + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +#ifdef CONFIG_MAPPING_DIRTY_HELPERS +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end); + +unsigned long wp_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr); +#endif + +#ifdef CONFIG_ANON_VMA_NAME +int set_anon_vma_name(unsigned long addr, unsigned long size, + const char __user *uname); +#else +static inline +int set_anon_vma_name(unsigned long addr, unsigned long size, + const char __user *uname) +{ + return -EINVAL; +} +#endif + +#ifdef CONFIG_UNACCEPTED_MEMORY + +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); + +#else + +static inline bool range_contains_unaccepted_memory(phys_addr_t start, + unsigned long size) +{ + return false; +} + +static inline void accept_memory(phys_addr_t start, unsigned long size) +{ +} + +#endif + +static inline bool pfn_is_unaccepted_memory(unsigned long pfn) +{ + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); +} + +void vma_pgtable_walk_begin(struct vm_area_struct *vma); +void vma_pgtable_walk_end(struct vm_area_struct *vma); + +int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +int reserve_mem_release_by_name(const char *name); + +#ifdef CONFIG_64BIT +int do_mseal(unsigned long start, size_t len_in, unsigned long flags); +#else +static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + /* noop on 32 bit */ + return 0; +} +#endif + +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field. + * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is + * known at compile-time. + * + * If the value of PAGE_OFFSET is not known at compile time, or if it is too + * small to leave at least 8 bits available above PP_SIGNATURE, we define the + * number of bits to be 0, which turns off the DMA index tracking altogether + * (see page_pool_register_dma_index()). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */ +#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8)) +#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \ + PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \ + !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ? \ + MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0) + +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return false; +} +#endif + +#define PAGE_SNAPSHOT_FAITHFUL (1 << 0) +#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) +#define PAGE_SNAPSHOT_PG_IDLE (1 << 2) + +struct page_snapshot { + struct folio folio_snapshot; + struct page page_snapshot; + unsigned long pfn; + unsigned long idx; + unsigned long flags; +}; + +static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps) +{ + return ps->flags & PAGE_SNAPSHOT_FAITHFUL; +} + +void snapshot_page(struct page_snapshot *ps, const struct page *page); + #endif /* _LINUX_MM_H */ |
