diff options
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r-- | include/linux/mm.h | 854 |
1 files changed, 416 insertions, 438 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..1f80baddacc5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5,6 +5,7 @@ #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> +#include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> @@ -30,12 +31,14 @@ #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> +#include <linux/cacheinfo.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; +struct folio_batch; extern int sysctl_page_lock_unfairness; @@ -86,7 +89,7 @@ extern int sysctl_legacy_va_layout; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; -extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS @@ -95,6 +98,14 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif +#ifndef DIRECT_MAP_PHYSMEM_END +# ifdef MAX_PHYSMEM_BITS +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# else +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# endif +#endif + #include <asm/page.h> #include <asm/processor.h> @@ -202,11 +213,11 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); -int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); -int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, +int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -226,7 +237,6 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) -#define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); @@ -320,21 +330,27 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 -#ifdef CONFIG_PPC +# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 +# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 +# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 +# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#if CONFIG_ARCH_PKEY_BITS > 3 +# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#else +# define VM_PKEY_BIT3 0 +#endif +#if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 @@ -352,13 +368,23 @@ extern unsigned int kobjsize(const void *objp); * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#else +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC) +#elif defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 @@ -373,8 +399,8 @@ extern unsigned int kobjsize(const void *objp); #endif #if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ +# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE @@ -391,6 +417,34 @@ extern unsigned int kobjsize(const void *objp); # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +/* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED_BIT 39 +#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#endif + +#ifdef CONFIG_64BIT +#define VM_DROPPABLE_BIT 40 +#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#elif defined(CONFIG_PPC32) +#define VM_DROPPABLE VM_ARCH_1 +#else +#define VM_DROPPABLE VM_NONE +#endif + +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) @@ -657,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -674,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -689,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -697,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -708,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -726,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } @@ -781,6 +835,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); +} + static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); @@ -973,27 +1032,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) return mas_prev(&vmi->mas, 0); } -static inline -struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) -{ - return mas_prev_range(&vmi->mas, 0); -} - -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) -{ - return vmi->mas.index; -} - -static inline unsigned long vma_iter_end(struct vma_iterator *vmi) -{ - return vmi->mas.last + 1; -} -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { @@ -1085,7 +1123,7 @@ static inline unsigned int compound_order(struct page *page) * * Return: The order of the folio. */ -static inline unsigned int folio_order(struct folio *folio) +static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; @@ -1177,82 +1215,51 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() or total_mapcount() instead. + * debugging purposes or implementation of other core folio_*() primitives. */ -static inline int folio_entire_mapcount(struct folio *folio) +static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; } -/* - * The atomic page->_mapcount, starts from -1: so that transitions - * both from it and to it can be tracked, using atomic_inc_and_test - * and atomic_add_negative(-1). - */ -static inline void page_mapcount_reset(struct page *page) +static inline int folio_large_mapcount(const struct folio *folio) { - atomic_set(&(page)->_mapcount, -1); + VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); + return atomic_read(&folio->_large_mapcount) + 1; } /** - * page_mapcount() - Number of times this precise page is mapped. - * @page: The page. + * folio_mapcount() - Number of mappings of this folio. + * @folio: The folio. * - * The number of times this page is mapped. If this page is part of - * a large folio, it includes the number of times this page is mapped - * as part of that folio. + * The folio mapcount corresponds to the number of present user page table + * entries that reference any part of a folio. Each such present user page + * table entry must be paired with exactly on folio reference. * - * The result is undefined for pages which cannot be mapped into userspace. - * For example SLAB or special types of pages. See function page_has_type(). - * They use this field in struct page differently. - */ -static inline int page_mapcount(struct page *page) -{ - int mapcount = atomic_read(&page->_mapcount) + 1; - - if (unlikely(PageCompound(page))) - mapcount += folio_entire_mapcount(page_folio(page)); - - return mapcount; -} - -int folio_total_mapcount(struct folio *folio); - -/** - * folio_mapcount() - Calculate the number of mappings of this folio. - * @folio: The folio. + * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts + * exactly once. * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. + * For hugetlb folios, each abstracted "hugetlb" user page table entry that + * references the entire folio counts exactly once, even when such special + * page table entries are comprised of multiple ordinary page table entries. + * + * Will report 0 for pages which cannot be mapped into userspace, such as + * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ -static inline int folio_mapcount(struct folio *folio) +static inline int folio_mapcount(const struct folio *folio) { - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; - return folio_total_mapcount(folio); -} + int mapcount; -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - return folio_total_mapcount(page_folio(page)); -} - -static inline bool folio_large_is_mapped(struct folio *folio) -{ - /* - * Reading _entire_mapcount below could be omitted if hugetlb - * participated in incrementing nr_pages_mapped when compound mapped. - */ - return atomic_read(&folio->_nr_pages_mapped) > 0 || - atomic_read(&folio->_entire_mapcount) >= 0; + if (likely(!folio_test_large(folio))) { + mapcount = atomic_read(&folio->_mapcount) + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + return mapcount; + } + return folio_large_mapcount(folio); } /** @@ -1261,11 +1268,9 @@ static inline bool folio_large_is_mapped(struct folio *folio) * * Return: True if any page in this folio is referenced by user page tables. */ -static inline bool folio_mapped(struct folio *folio) +static inline bool folio_mapped(const struct folio *folio) { - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) >= 0; - return folio_large_is_mapped(folio); + return folio_mapcount(folio) >= 1; } /* @@ -1273,11 +1278,9 @@ static inline bool folio_mapped(struct folio *folio) * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ -static inline bool page_mapped(struct page *page) +static inline bool page_mapped(const struct page *page) { - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - return folio_large_is_mapped(page_folio(page)); + return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) @@ -1296,15 +1299,12 @@ static inline struct folio *virt_to_folio(const void *x) void __folio_put(struct folio *folio); -void put_pages_list(struct list_head *pages); - void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); +int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -void destroy_large_folio(struct folio *folio); - /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { @@ -1422,27 +1422,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf); #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -bool __put_devmap_managed_page_refs(struct page *page, int refs); -static inline bool put_devmap_managed_page_refs(struct page *page, int refs) +bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); +static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; - if (!is_zone_device_page(page)) + if (!folio_is_zone_device(folio)) return false; - return __put_devmap_managed_page_refs(page, refs); + return __put_devmap_managed_folio_refs(folio, refs); } #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_page_refs(struct page *page, int refs) +static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ -static inline bool put_devmap_managed_page(struct page *page) -{ - return put_devmap_managed_page_refs(page, 1); -} - /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) @@ -1463,7 +1458,10 @@ static inline void folio_get(struct folio *folio) static inline void get_page(struct page *page) { - folio_get(page_folio(page)); + struct folio *folio = page_folio(page); + if (WARN_ON_ONCE(folio_test_slab(folio))) + return; + folio_get(folio); } static inline __must_check bool try_get_page(struct page *page) @@ -1514,6 +1512,8 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } +void folios_put_refs(struct folio_batch *folios, unsigned int *refs); + /* * union release_pages_arg - an array of pages or folios * @@ -1536,29 +1536,33 @@ void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. - * @nr: How many folios there are. * - * Like folio_put(), but for an array of folios. This is more efficient - * than writing the loop yourself as it will optimise the locks which - * need to be taken if the folios are freed. + * Like folio_put(), but for a batch of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which need + * to be taken if the folios are freed. The folios batch is returned + * empty and ready to be reused for another batch; there is no need to + * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ -static inline void folios_put(struct folio **folios, unsigned int nr) +static inline void folios_put(struct folio_batch *folios) { - release_pages(folios, nr); + folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); + if (folio_test_slab(folio)) + return; + /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ - if (put_devmap_managed_page(&folio->page)) + if (put_devmap_managed_folio_refs(folio, 1)) return; folio_put(folio); } @@ -1590,17 +1594,20 @@ static inline void put_page(struct page *page) * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() - * provides safe operation for get_user_pages(), page_mkclean() and + * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); +void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); +void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { @@ -1640,13 +1647,11 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -extern int page_to_nid(const struct page *page); +int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - struct page *p = (struct page *)page; - - return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif @@ -1750,6 +1755,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } + +bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { @@ -1803,6 +1810,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) @@ -1901,7 +1912,7 @@ static inline unsigned long page_to_section(const struct page *page) * * Return: The Page Frame Number of the first page in the folio. */ -static inline unsigned long folio_pfn(struct folio *folio) +static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } @@ -1933,8 +1944,8 @@ static inline struct folio *pfn_folio(unsigned long pfn) * * For more information, please see Documentation/core-api/pin_user_pages.rst. * - * Return: True, if it is likely that the page has been "dma-pinned". - * False, if the page is definitely not dma-pinned. + * Return: True, if it is likely that the folio has been "dma-pinned". + * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { @@ -1953,11 +1964,6 @@ static inline bool folio_maybe_dma_pinned(struct folio *folio) GUP_PIN_COUNTING_BIAS; } -static inline bool page_maybe_dma_pinned(struct page *page) -{ - return folio_maybe_dma_pinned(page_folio(page)); -} - /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. @@ -2054,7 +2060,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(struct folio *folio) +static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; @@ -2065,6 +2071,13 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#else +#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#endif + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to @@ -2123,7 +2136,7 @@ static inline struct folio *folio_next(struct folio *folio) * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ -static inline unsigned int folio_shift(struct folio *folio) +static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } @@ -2136,49 +2149,78 @@ static inline unsigned int folio_shift(struct folio *folio) * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. */ -static inline size_t folio_size(struct folio *folio) +static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** - * folio_estimated_sharers - Estimate the number of sharers of a folio. + * folio_likely_mapped_shared - Estimate if the folio is mapped into the page + * tables of more than one MM * @folio: The folio. * - * folio_estimated_sharers() aims to serve as a function to efficiently - * estimate the number of processes sharing a folio. This is done by - * looking at the precise mapcount of the first subpage in the folio, and - * assuming the other subpages are the same. This may not be true for large - * folios. If you want exact mapcounts for exact calculations, look at - * page_mapcount() or folio_total_mapcount(). + * This function checks if the folio is currently mapped into more than one + * MM ("mapped shared"), or if the folio is only mapped into a single MM + * ("mapped exclusively"). + * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * + * As precise information is not easily available for all folios, this function + * estimates the number of MMs ("sharers") that are currently mapping a folio + * using the number of times the first page of the folio is currently mapped + * into page tables. * - * Return: The estimated number of processes sharing a folio. + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. + * + * For other folios, the result can be fuzzy: + * #. For partially-mappable large folios (THP), the return value can wrongly + * indicate "mapped exclusively" (false negative) when the folio is + * only partially mapped into at least one MM. + * #. For pagecache folios (including hugetlb), the return value can wrongly + * indicate "mapped shared" (false positive) when two VMAs in the same MM + * cover the same file range. + * + * Further, this function only considers current page table mappings that + * are tracked using the folio mapcount(s). + * + * This function does not consider: + * #. If the folio might get mapped in the (near) future (e.g., swapcache, + * pagecache, temporary unmapping for migration). + * #. If the folio is mapped differently (VM_PFNMAP). + * #. If hugetlb page table sharing applies. Callers might want to check + * hugetlb_pmd_shared(). + * + * Return: Whether the folio is estimated to be mapped into more than one MM. */ -static inline int folio_estimated_sharers(struct folio *folio) +static inline bool folio_likely_mapped_shared(struct folio *folio) { - return page_mapcount(folio_page(folio, 0)); -} + int mapcount = folio_mapcount(folio); -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; + /* Only partially-mappable folios require more care. */ + if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) + return mapcount > 1; + + /* A single mapping implies "mapped exclusively". */ + if (mapcount <= 1) + return false; + + /* If any page is mapped more than once we treat it "mapped shared". */ + if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) + return true; + + /* Let's guess based on the first subpage. */ + return atomic_read(&folio->_mapcount) > 0; } -#endif #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { - int ret; - long i, nr = folio_nr_pages(folio); - - for (i = 0; i < nr; i++) { - ret = arch_make_page_accessible(folio_page(folio, i)); - if (ret) - break; - } - - return ret; + return 0; } #endif @@ -2187,11 +2229,6 @@ static inline int arch_make_folio_accessible(struct folio *folio) */ #include <linux/vmstat.h> -static __always_inline void *lowmem_page_address(const struct page *page) -{ - return page_to_virt(page); -} - #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif @@ -2214,6 +2251,11 @@ void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif +static __always_inline void *lowmem_page_address(const struct page *page) +{ + return page_to_virt(page); +} + #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) @@ -2225,19 +2267,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not @@ -2297,6 +2326,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; @@ -2371,15 +2401,40 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn); -int follow_phys(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -2484,6 +2539,10 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, + struct folio **folios, unsigned int max_folios, + pgoff_t *offset); +int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); @@ -2499,16 +2558,12 @@ struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack); - /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However @@ -2529,21 +2584,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -bool vma_needs_dirty_tracking(struct vm_area_struct *vma); -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) -{ - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - return vma_wants_writenotify(vma, vma->vm_page_prot); - return !!(vma->vm_flags & VM_WRITE); - -} bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, @@ -2595,19 +2635,19 @@ static inline void dec_mm_counter(struct mm_struct *mm, int member) mm_trace_rss_stat(mm, member); } -/* Optimized variant when page is already known not to be PageAnon */ -static inline int mm_counter_file(struct page *page) +/* Optimized variant when folio is already known not to be anon */ +static inline int mm_counter_file(struct folio *folio) { - if (PageSwapBacked(page)) + if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } -static inline int mm_counter(struct page *page) +static inline int mm_counter(struct folio *folio) { - if (PageAnon(page)) + if (folio_test_anon(folio)) return MM_ANONPAGES; - return mm_counter_file(page); + return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) @@ -2667,6 +2707,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { @@ -2837,12 +2901,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) * * Return: The ptdesc describing the allocated page tables. */ -static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) +static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } +#define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables @@ -2858,7 +2923,7 @@ static inline void pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); @@ -2893,6 +2958,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); + BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); + return ptlock_ptr(virt_to_ptdesc(pte)); +} + static inline bool ptlock_init(struct ptdesc *ptdesc) { /* @@ -2909,7 +2981,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) return true; } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ @@ -2917,23 +2989,24 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) - return false; __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); - return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) +static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -2942,7 +3015,29 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) +{ + if (!ptlock_init(ptdesc)) + return false; + __pagetable_ctor(ptdesc); + return true; +} + +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -2955,12 +3050,16 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ @@ -2980,7 +3079,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) -#if USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { @@ -3006,14 +3105,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3024,7 +3115,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3039,24 +3129,13 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + ptdesc_pmd_pts_init(ptdesc); + __pagetable_ctor(ptdesc); return true; } -static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be @@ -3078,18 +3157,17 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + __pagetable_ctor(ptdesc); +} - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); @@ -3110,13 +3188,7 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ -static inline void free_reserved_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - adjust_managed_page_count(page, 1); -} +void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) @@ -3173,8 +3245,6 @@ static inline unsigned long get_num_physpages(void) */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); -unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, - unsigned long end_pfn); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, @@ -3190,7 +3260,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif -extern void set_dma_reserve(unsigned long new_dma_reserve); extern void mem_init(void); extern void __init mmap_init(void); @@ -3202,9 +3271,6 @@ static inline void show_mem(void) extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES -extern unsigned long arch_reserved_kernel_pages(void); -#endif extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); @@ -3251,78 +3317,11 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); -extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); - -/* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} - -/* We are about to modify the VMA's memory policy. */ -static inline struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3355,19 +3354,21 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); -/* This is an obsolete alternative to _install_special_mapping. */ -extern int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +unsigned long +__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); + +static inline unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return __get_unmapped_area(file, addr, len, pgoff, flags, 0); +} -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, @@ -3375,14 +3376,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU -extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3409,6 +3410,7 @@ struct vm_unmapped_area_info { unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; + unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); @@ -3430,9 +3432,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, @@ -3609,9 +3608,6 @@ static inline vm_fault_t vmf_fs_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags); - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) @@ -3765,24 +3761,22 @@ static inline bool page_is_guard(struct page *page) return PageGuard(page); } -bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return false; - return __set_page_guard(zone, page, order, migratetype); + return __set_page_guard(zone, page, order); } -void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order, - int migratetype); +void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) + unsigned int order) { if (!debug_guardpage_enabled()) return; - __clear_page_guard(zone, page, order, migratetype); + __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ @@ -3792,9 +3786,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) { return false; } + unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, - unsigned int order, int migratetype) {} + unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA @@ -3817,7 +3811,7 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, +int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #endif @@ -3842,8 +3836,6 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -void pmd_init(void *addr); -void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); @@ -3949,7 +3941,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE @@ -3962,7 +3953,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); -struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { @@ -3983,12 +3973,6 @@ static inline void num_poisoned_pages_sub(unsigned long pfn, long i) } #endif -#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_KSM) -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr); -#endif - #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); @@ -4029,10 +4013,10 @@ enum mf_result { enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, - MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -4046,13 +4030,12 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) -extern void clear_huge_page(struct page *page, - unsigned long addr_hint, - unsigned int pages_per_huge_page); +void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); @@ -4105,45 +4088,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, @@ -4158,18 +4102,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4177,9 +4121,43 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); +} + +void vma_pgtable_walk_begin(struct vm_area_struct *vma); +void vma_pgtable_walk_end(struct vm_area_struct *vma); + +int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); + +#ifdef CONFIG_64BIT +int do_mseal(unsigned long start, size_t len_in, unsigned long flags); +#else +static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + /* noop on 32 bit */ + return 0; +} +#endif - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); } +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_MM_H */ |