diff options
Diffstat (limited to 'include')
48 files changed, 1197 insertions, 633 deletions
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index a57d667addd2..d7f6335d3999 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -25,6 +25,13 @@ static inline pte_t huge_pte_mkwrite(pte_t pte) return pte_mkwrite(pte); } +#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT +static inline pte_t huge_pte_wrprotect(pte_t pte) +{ + return pte_wrprotect(pte); +} +#endif + static inline pte_t huge_pte_mkdirty(pte_t pte) { return pte_mkdirty(pte); @@ -37,7 +44,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { - return pte_mkuffd_wp(pte); + return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) @@ -104,13 +111,6 @@ static inline int huge_pte_none_mostly(pte_t pte) return huge_pte_none(pte) || is_pte_marker(pte); } -#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} -#endif - #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a2c8ed60233a..6796abe1900e 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,6 +19,18 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +#ifndef pfn_valid +static inline int pfn_valid(unsigned long pfn) +{ + /* avoid <linux/mm.h> include hell */ + extern unsigned long max_mapnr; + unsigned long pfn_offset = ARCH_PFN_OFFSET; + + return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; +} +#define pfn_valid pfn_valid +#endif + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) /* memmap is virtually contiguous. */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index 6fc47561814c..c0be2edeb484 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -84,8 +84,6 @@ extern unsigned long memory_end; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #endif -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) - #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b9637d63e6f0..41a41561b773 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -556,6 +556,7 @@ struct request_queue { #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ +#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ @@ -1253,6 +1254,12 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_synchronous(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_SYNCHRONOUS, + &bdev_get_queue(bdev)->queue_flags); +} + static inline bool bdev_stable_writes(struct block_device *bdev) { return test_bit(QUEUE_FLAG_STABLE_WRITES, @@ -1397,7 +1404,6 @@ struct block_device_operations { unsigned int flags); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, @@ -1432,10 +1438,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int bdev_read_page(struct block_device *, sector_t, struct page *); -extern int bdev_write_page(struct block_device *, sector_t, struct page *, - struct writeback_control *); - static inline void blk_wake_io_task(struct task_struct *waiter) { /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 33fa5e94aa80..8f14dca5fed7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ - struct page *b_page; /* the page this bh is mapped to */ + union { + struct page *b_page; /* the page this bh is mapped to */ + struct folio *b_folio; /* the folio this bh is mapped to */ + }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ diff --git a/include/linux/damon.h b/include/linux/damon.h index ad15a5b88e3a..d5d4d19928e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,6 +8,7 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include <linux/memcontrol.h> #include <linux/mutex.h> #include <linux/time64.h> #include <linux/types.h> @@ -90,6 +91,12 @@ struct damon_target { * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. */ enum damos_action { DAMOS_WILLNEED, @@ -216,6 +223,44 @@ struct damos_stat { }; /** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR is supporting all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any + * filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: If the matching page should filtered out or in. + * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches to this and avoid applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + +/** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. @@ -239,6 +284,7 @@ struct damos_access_pattern { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -254,6 +300,10 @@ struct damos_access_pattern { * If all schemes that registered to a &struct damon_ctx are inactive, DAMON * stops monitoring and just repeatedly checks the watermarks. * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -263,6 +313,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; + struct list_head filters; struct damos_stat stat; struct list_head list; }; @@ -303,10 +354,10 @@ struct damon_ctx; * users should register the low level operations for their target address * space and usecase via the &damon_ctx.ops. Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.ops_update_interval, and + * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each - * &damon_ctx.aggr_interval. + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. @@ -516,6 +567,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -536,6 +593,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks); diff --git a/include/linux/fs.h b/include/linux/fs.h index d46ae1e525fc..c85916e9f7db 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -166,6 +166,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +#define FMODE_NOREUSE ((__force fmode_t)0x800000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e1..5088637fe5c2 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44242268f53b..b06254e76d99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -86,8 +86,8 @@ static inline void kmap_flush_unused(void); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. + * While kmap_local_page() is significantly faster than kmap() for the highmem + * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across @@ -119,9 +119,8 @@ static inline void *kmap_local_page(struct page *page); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() for the higmem case it - * comes with restrictions about the pointer validity. Only use when really - * necessary. + * While it is significantly faster than kmap() for the highmem case it + * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across @@ -208,31 +207,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#ifndef vma_alloc_zeroed_movable_folio /** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into + * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. + * @vma: The VMA the page is to be allocated for. + * @vaddr: The virtual address the page will be inserted into. * - * Returns: The allocated and zeroed HIGHMEM page + * This function will allocate a page suitable for inserting into this + * VMA at this virtual address. It may be allocated from highmem or + * the movable zone. An architecture may provide its own implementation. * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - * - * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own - * implementation. + * Return: A folio containing one allocated and zeroed page or NULL if + * we are out of memory. */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +static inline +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + struct folio *folio; - if (page) - clear_user_highpage(page, vaddr); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + if (folio) + clear_user_highpage(&folio->page, vaddr); - return page; + return folio; } #endif @@ -416,6 +414,36 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) } /** + * memcpy_from_file_folio - Copy some bytes from a file folio. + * @to: The destination buffer. + * @folio: The folio to copy from. + * @pos: The position in the file. + * @len: The maximum number of bytes to copy. + * + * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE + * if the folio comes from HIGHMEM, and by the size of the folio. + * + * Return: The number of bytes copied from the folio. + */ +static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, + loff_t pos, size_t len) +{ + size_t offset = offset_in_folio(folio, pos); + char *from = kmap_local_folio(folio, offset); + + if (folio_test_highmem(folio)) { + offset = offset_in_page(offset); + len = min_t(size_t, len, PAGE_SIZE - offset); + } else + len = min(len, folio_size(folio) - offset); + + memcpy(to, from, len); + kunmap_local(from); + + return len; +} + +/** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1341fdcf666..70bd867eba94 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -187,7 +187,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } -void deferred_split_huge_page(struct page *page); +void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -293,15 +293,6 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } -static inline struct list_head *page_deferred_list(struct page *page) -{ - /* - * See organization of tail pages of compound page in - * "struct page" definition. - */ - return &page[2].deferred_list; -} - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -349,7 +340,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_huge_page(struct page *page) {} +static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9ab9d3105d5c..7c977d234aba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include <linux/mm.h> #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/fs.h> @@ -170,11 +171,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +bool isolate_hugetlb(struct folio *folio, struct list_head *list); +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -193,6 +194,43 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); +/* + * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. + * Returns the pte_t* if found, or NULL if the address is not mapped. + * + * IMPORTANT: we should normally not directly call this function, instead + * this is only a common interface to implement arch-specific + * walker. Please use hugetlb_walk() instead, because that will attempt to + * verify the locking for you. + * + * Since this function will walk all the pgtable pages (including not only + * high-level pgtable page, but also PUD entry that can be unshared + * concurrently for VM_SHARED), the caller of this function should be + * responsible of its thread safety. One can follow this rule: + * + * (1) For private mappings: pmd unsharing is not possible, so holding the + * mmap_lock for either read or write is sufficient. Most callers + * already hold the mmap_lock, so normally, no special action is + * required. + * + * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged + * pgtable page can go away from under us! It can be done by a pmd + * unshare with a follow up munmap() on the other process), then we + * need either: + * + * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare + * won't happen upon the range (it also makes sure the pte_t we + * read is the right and stable one), or, + * + * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make + * sure even if unshare happened the racy unmap() will wait until + * i_mmap_rwsem is released. + * + * Option (2.1) is the safest, which guarantees pte stability from pmd + * sharing pov, until the vma lock released. Option (2.2) doesn't protect + * a concurrent pmd unshare, but it makes sure the pgtable page is safe to + * access. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); @@ -211,7 +249,7 @@ void hugetlb_vma_lock_release(struct kref *kref); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); -unsigned long hugetlb_change_protection(struct vm_area_struct *vma, +long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); @@ -375,12 +413,12 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) { - return -EBUSY; + return false; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } @@ -391,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } @@ -400,7 +438,7 @@ static inline void move_hugetlb_state(struct folio *old_folio, { } -static inline unsigned long hugetlb_change_protection( +static inline long hugetlb_change_protection( struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -679,16 +717,16 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); @@ -843,9 +881,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE -extern void hugetlb_clear_page_hwpoison(struct page *hpage); +extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else -static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif @@ -998,21 +1036,21 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -1213,4 +1251,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif +static inline bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; +} + +/* + * Safe version of huge_pte_offset() to check the locks. See comments + * above huge_pte_offset(). + */ +static inline pte_t * +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) +{ +#if defined(CONFIG_HUGETLB_PAGE) && \ + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * If pmd sharing possible, locking needed to safely walk the + * hugetlb pgtables. More information can be found at the comment + * above huge_pte_offset() in the same file. + * + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. + */ + if (__vma_shareable_lock(vma)) + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && + !lockdep_is_held( + &vma->vm_file->f_mapping->i_mmap_rwsem)); +#endif + return huge_pte_offset(vma->vm_mm, addr, sz); +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a8063..3d82d91f49ac 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 2170e0cc279d..5962072a4b19 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1570,8 +1570,6 @@ extern int jbd2_journal_inode_ranged_write(handle_t *handle, extern int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *inode, loff_t start_byte, loff_t length); -extern int jbd2_journal_submit_inode_data_buffers( - struct jbd2_inode *jinode); extern int jbd2_journal_finish_inode_data_buffers( struct jbd2_inode *jinode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 96c9d56e5510..f7ef70661ce2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -96,15 +96,6 @@ static inline bool kasan_has_integrated_init(void) } #ifdef CONFIG_KASAN - -struct kasan_cache { -#ifdef CONFIG_KASAN_GENERIC - int alloc_meta_offset; - int free_meta_offset; -#endif - bool is_kmalloc; -}; - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -120,19 +111,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); -} - -void __kasan_cache_create_kmalloc(struct kmem_cache *cache); -static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) -{ - if (kasan_enabled()) - __kasan_cache_create_kmalloc(cache); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_poison_slab(struct slab *slab); @@ -249,9 +234,11 @@ static __always_inline bool kasan_check_byte(const void *addr) static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} -static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -302,6 +289,11 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; + size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object); slab_flags_t kasan_never_merge(void); void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..1fadb5f5978b 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -433,6 +432,7 @@ struct ma_wr_state { .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ + .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ @@ -456,7 +456,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); @@ -471,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, + unsigned long addr) +{ + memset(mas, 0, sizeof(struct ma_state)); + mas->tree = tree; + mas->index = mas->last = addr; + mas->max = ULONG_MAX; + mas->node = MAS_START; +} + /* Checks if a mas has not found anything */ static inline bool mas_is_none(struct ma_state *mas) { @@ -483,9 +493,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, @@ -517,7 +524,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. - * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +645,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1e38e99998c7..b6eda2ab205d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -466,34 +466,34 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) } /* - * page_memcg_check - get the memory cgroup associated with a page - * @page: a pointer to the page struct + * folio_memcg_check - Get the memory cgroup associated with a folio. + * @folio: Pointer to the folio. * - * Returns a pointer to the memory cgroup associated with the page, - * or NULL. This function unlike page_memcg() can take any page - * as an argument. It has to be used in cases when it's not known if a page + * Returns a pointer to the memory cgroup associated with the folio, + * or NULL. This function unlike folio_memcg() can take any folio + * as an argument. It has to be used in cases when it's not known if a folio * has an associated memory cgroup pointer or an object cgroups vector or * an object cgroup. * - * For a non-kmem page any of the following ensures page and memcg binding + * For a non-kmem folio any of the following ensures folio and memcg binding * stability: * - * - the page lock + * - the folio lock * - LRU isolation - * - lock_page_memcg() + * - lock_folio_memcg() * - exclusive reference * - mem_cgroup_trylock_pages() * - * For a kmem page a caller should hold an rcu read lock to protect memcg - * associated with a kmem page from being released. + * For a kmem folio a caller should hold an rcu read lock to protect memcg + * associated with a kmem folio from being released. */ -static inline struct mem_cgroup *page_memcg_check(struct page *page) +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) { /* - * Because page->memcg_data might be changed asynchronously - * for slab pages, READ_ONCE() should be used here. + * Because folio->memcg_data might be changed asynchronously + * for slabs, READ_ONCE() should be used here. */ - unsigned long memcg_data = READ_ONCE(page->memcg_data); + unsigned long memcg_data = READ_ONCE(folio->memcg_data); if (memcg_data & MEMCG_DATA_OBJCGS) return NULL; @@ -508,6 +508,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + if (PageTail(page)) + return NULL; + return folio_memcg_check((struct folio *)page); +} + static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) { struct mem_cgroup *memcg; @@ -794,6 +801,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) percpu_ref_put(&objcg->refcnt); } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -878,7 +890,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) @@ -1165,6 +1177,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return NULL; } +static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg_check(struct page *page) { return NULL; @@ -1301,6 +1318,11 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } +static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } @@ -1760,24 +1782,24 @@ static inline bool memcg_bpf_enabled(void) return static_branch_likely(&memcg_bpf_enabled_key); } -extern struct static_key_false memcg_kmem_enabled_key; +extern struct static_key_false memcg_kmem_online_key; -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { - return static_branch_likely(&memcg_kmem_enabled_key); + return static_branch_likely(&memcg_kmem_online_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { - if (memcg_kmem_enabled()) + if (memcg_kmem_online()) __memcg_kmem_uncharge_page(page, order); } @@ -1798,7 +1820,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (!memcg_kmem_enabled()) + if (!memcg_kmem_online()) return; rcu_read_lock(); @@ -1843,7 +1865,7 @@ static inline bool memcg_bpf_enabled(void) return false; } -static inline bool memcg_kmem_enabled(void) +static inline bool memcg_kmem_online(void) { return false; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3ef77f52a4f0..6241a1596a75 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,6 +18,7 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 +#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration @@ -61,16 +62,16 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION -extern void putback_movable_pages(struct list_head *l); +void putback_movable_pages(struct list_head *l); int migrate_folio_extra(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); -extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, - unsigned long private, enum migrate_mode mode, int reason, - unsigned int *ret_succeeded); -extern struct page *alloc_migration_target(struct page *page, unsigned long private); -extern int isolate_movable_page(struct page *page, isolate_mode_t mode); +int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, + unsigned long private, enum migrate_mode mode, int reason, + unsigned int *ret_succeeded); +struct page *alloc_migration_target(struct page *page, unsigned long private); +bool isolate_movable_page(struct page *page, isolate_mode_t mode); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -91,8 +92,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, static inline struct page *alloc_migration_target(struct page *page, unsigned long private) { return NULL; } -static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) - { return -EBUSY; } +static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) @@ -123,6 +124,15 @@ static inline bool folio_test_movable(struct folio *folio) } static inline +const struct movable_operations *folio_movable_ops(struct folio *folio) +{ + VM_BUG_ON(!__folio_test_movable(folio)); + + return (const struct movable_operations *) + ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE); +} + +static inline const struct movable_operations *page_movable_ops(struct page *page) { VM_BUG_ON(!__PageMovable(page)); @@ -132,8 +142,8 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, - struct vm_area_struct *vma, int node); +int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, + int node); #else static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) diff --git a/include/linux/mm.h b/include/linux/mm.h index 716d30d93616..1f79667824eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -282,7 +282,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ +#else /* CONFIG_MMU */ +#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ +#define VM_UFFD_MISSING 0 +#endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ @@ -422,8 +427,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR @@ -628,6 +633,63 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + mmap_assert_write_locked(vma->vm_mm); + __vm_flags_mod(vma, set, clear); +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; @@ -671,16 +733,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return mas_find(&vmi->mas, max); + return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * Uses vma_find() to get the first VMA when the iterator starts. + * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ - return vma_find(vmi, ULONG_MAX); + return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -693,12 +755,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) return vmi->mas.index; } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* @@ -720,11 +820,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before _folio_order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ static inline unsigned int compound_order(struct page *page) { - if (!PageHead(page)) + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) return 0; - return page[1].compound_order; + return folio->_folio_order; } /** @@ -783,6 +892,13 @@ static inline bool get_page_unless_zero(struct page *page) return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { @@ -832,34 +948,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - return atomic_read(folio_mapcount_ptr(folio)) + 1; -} - -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. - * Must be called only on head of compound page. - */ -static inline int head_compound_mapcount(struct page *head) -{ - return atomic_read(compound_mapcount_ptr(head)) + 1; -} - -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED 0x800000 -#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ - return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; + return atomic_read(&folio->_entire_mapcount) + 1; } /* @@ -872,25 +961,29 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount of compound_head of page. +/** + * page_mapcount() - Number of times this precise page is mapped. + * @page: The page. + * + * The number of times this page is mapped. If this page is part of + * a large folio, it includes the number of times this page is mapped + * as part of that folio. * - * Result is undefined for pages which cannot be mapped into userspace. + * The result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * They use this field in struct page differently. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; - if (likely(!PageCompound(page))) - return mapcount; - page = compound_head(page); - return head_compound_mapcount(page) + mapcount; + if (unlikely(PageCompound(page))) + mapcount += folio_entire_mapcount(page_folio(page)); + + return mapcount; } -int total_compound_mapcount(struct page *head); +int folio_total_mapcount(struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -907,24 +1000,24 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return total_compound_mapcount(&folio->page); + return folio_total_mapcount(folio); } static inline int total_mapcount(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - return total_compound_mapcount(compound_head(page)); + return folio_total_mapcount(page_folio(page)); } static inline bool folio_large_is_mapped(struct folio *folio) { /* - * Reading folio_mapcount_ptr() below could be omitted if hugetlb - * participated in incrementing subpages_mapcount when compound mapped. + * Reading _entire_mapcount below could be omitted if hugetlb + * participated in incrementing nr_pages_mapped when compound mapped. */ - return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || - atomic_read(folio_mapcount_ptr(folio)) >= 0; + return atomic_read(&folio->_nr_pages_mapped) > 0 || + atomic_read(&folio->_entire_mapcount) >= 0; } /** @@ -999,8 +1092,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) { + struct folio *folio = (struct folio *)page; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - page[1].compound_dtor = compound_dtor; + VM_BUG_ON_PAGE(!PageHead(page), page); + folio->_folio_dtor = compound_dtor; } static inline void folio_set_compound_dtor(struct folio *folio, @@ -1012,44 +1108,13 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { - page[1].compound_order = order; -#ifdef CONFIG_64BIT - page[1].compound_nr = 1U << order; -#endif -} - -/* - * folio_set_compound_order is generally passed a non-zero order to - * initialize a large folio. However, hugetlb code abuses this by - * passing in zero when 'dissolving' a large folio. - */ -static inline void folio_set_compound_order(struct folio *folio, - unsigned int order) -{ - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + struct folio *folio = (struct folio *)page; folio->_folio_order = order; #ifdef CONFIG_64BIT - folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} - -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ - if (!PageHead(page)) - return 1; -#ifdef CONFIG_64BIT - return page[1].compound_nr; -#else - return 1UL << compound_order(page); + folio->_folio_nr_pages = 1U << order; #endif } @@ -1076,16 +1141,6 @@ static inline unsigned int thp_order(struct page *page) } /** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return compound_nr(page); -} - -/** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * @@ -1226,8 +1281,6 @@ static inline void get_page(struct page *page) folio_get(page_folio(page)); } -int __must_check try_grab_page(struct page *page, unsigned int flags); - static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); @@ -1369,6 +1422,21 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ + /* + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. + */ + return flags & (VM_MAYSHARE | VM_MAYOVERLAY); +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1643,11 +1711,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1665,7 +1728,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1676,7 +1739,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then @@ -1784,6 +1847,33 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline unsigned long compound_nr(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + return folio_nr_pages((struct folio *)page); +} + /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. @@ -1833,6 +1923,24 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ + return page_mapcount(folio_page(folio, 0)); +} + #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { @@ -1929,6 +2037,21 @@ static inline bool page_is_pfmemalloc(const struct page *page) } /* + * Return true only if the folio has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool folio_is_pfmemalloc(const struct folio *folio) +{ + /* + * lru.next has bit 1 set if the page is allocated from the + * pfmemalloc reserves. Callers may simply overwrite it if + * they do not need to preserve that information. + */ + return (uintptr_t)folio->lru.next & BIT(1); +} + +/* * Only to be called by the page allocator on a freshly allocated * page. */ @@ -2015,6 +2138,8 @@ static inline bool can_do_mlock(void) { return false; } extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -2022,13 +2147,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); +static inline void zap_vma_pages(struct vm_area_struct *vma) +{ + zap_page_range_single(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); +} void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); struct mmu_notifier_range; @@ -2175,21 +2303,18 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); -extern unsigned long change_protection(struct mmu_gather *tlb, +extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgprot_t newprot, - unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); + unsigned long end, unsigned long cp_flags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); -int pin_user_pages_fast_only(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) @@ -2813,23 +2938,21 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ - return __vma_adjust(vma, start, end, pgoff, insert, NULL); -} -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, @@ -2837,9 +2960,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); - static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, @@ -2887,7 +3007,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, @@ -2895,6 +3015,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3100,81 +3223,6 @@ static inline vm_fault_t vmf_error(int err) struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem. Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed. Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region. And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. - */ - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) @@ -3187,71 +3235,6 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) } /* - * Indicates for which pages that are write-protected in the page table, - * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the - * GUP pin will remain consistent with the pages mapped into the page tables - * of the MM. - * - * Temporary unmapping of PageAnonExclusive() pages or clearing of - * PageAnonExclusive() has to protect against concurrent GUP: - * * Ordinary GUP: Using the PT lock - * * GUP-fast and fork(): mm->write_protect_seq - * * GUP-fast and KSM or temporary unmapping (swap, migration): see - * page_try_share_anon_rmap() - * - * Must be called with the (sub)page that's actually referenced via the - * page table entry, which might not necessarily be the head page for a - * PTE-mapped THP. - * - * If the vma is NULL, we're coming from the GUP-fast path and might have - * to fallback to the slow path just to lookup the vma. - */ -static inline bool gup_must_unshare(struct vm_area_struct *vma, - unsigned int flags, struct page *page) -{ - /* - * FOLL_WRITE is implicitly handled correctly as the page table entry - * has to be writable -- and if it references (part of) an anonymous - * folio, that part is required to be marked exclusive. - */ - if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) - return false; - /* - * Note: PageAnon(page) is stable until the page is actually getting - * freed. - */ - if (!PageAnon(page)) { - /* - * We only care about R/O long-term pining: R/O short-term - * pinning does not have the semantics to observe successive - * changes through the process page tables. - */ - if (!(flags & FOLL_LONGTERM)) - return false; - - /* We really need the vma ... */ - if (!vma) - return true; - - /* - * ... because we only care about writable private ("COW") - * mappings where we have to break COW early. - */ - return is_cow_mapping(vma->vm_flags); - } - - /* Paired with a memory barrier in page_try_share_anon_rmap(). */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) - smp_rmb(); - - /* - * Note that PageKsm() pages cannot be exclusive, and consequently, - * cannot get pinned. - */ - return !PageAnonExclusive(page); -} - -/* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ @@ -3550,6 +3533,11 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, @@ -3667,7 +3655,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) * VM_MAYWRITE as we still want them to be COW-writable. */ if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); + vm_flags_clear(vma, VM_MAYWRITE); } return 0; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ff3f3f23f649..de1e622dd366 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } @@ -577,4 +577,15 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, #endif } +static inline bool vma_has_recency(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) + return false; + + if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) + return false; + + return true; +} + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index af8119776ab1..0722859c3647 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -140,30 +140,6 @@ struct page { }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - atomic_t compound_mapcount; - atomic_t subpages_mapcount; - atomic_t compound_pincount; -#ifdef CONFIG_64BIT - unsigned int compound_nr; /* 1 << compound_order */ -#endif - }; - struct { /* Second tail page of transparent huge page */ - unsigned long _compound_pad_1; /* compound_head */ - unsigned long _compound_pad_2; - /* For both global and memcg */ - struct list_head deferred_list; - }; - struct { /* Second tail page of hugetlb page */ - unsigned long _hugetlb_pad_1; /* compound_head */ - void *hugetlb_subpool; - void *hugetlb_cgroup; - void *hugetlb_cgroup_rsvd; - void *hugetlb_hwpoison; - /* No more space on 32-bit: use third tail if more */ }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ @@ -302,20 +278,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_flags_1: For large folios, additional page flags. - * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_subpages_mapcount: Do not use directly, call folio_mapcount(). + * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_flags_2: For alignment. Do not use. - * @_head_2: Points to the folio. Do not use. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). + * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -358,14 +331,16 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + /* public: */ unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _compound_mapcount; - atomic_t _subpages_mapcount; + atomic_t _entire_mapcount; + atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -373,10 +348,19 @@ struct folio { struct { unsigned long _flags_2; unsigned long _head_2; + /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ + }; + struct { + unsigned long _flags_2a; + unsigned long _head_2a; + /* public: */ + struct list_head _deferred_list; + /* private: the union with struct page is transitional */ }; struct page __page_2; }; @@ -401,53 +385,14 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -FOLIO_MATCH(compound_dtor, _folio_dtor); -FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _compound_mapcount); -FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); -FOLIO_MATCH(compound_pincount, _pincount); -#ifdef CONFIG_64BIT -FOLIO_MATCH(compound_nr, _folio_nr_pages); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); -FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); -FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); -FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH -static inline atomic_t *folio_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->compound_mapcount; -} - -static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->subpages_mapcount; -} - -static inline atomic_t *compound_mapcount_ptr(struct page *page) -{ - return &page[1].compound_mapcount; -} - -static inline atomic_t *subpages_mapcount_ptr(struct page *page) -{ - return &page[1].subpages_mapcount; -} - -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ @@ -546,7 +491,15 @@ struct vm_area_struct { * See vmf_insert_mixed_prot() for discussion. */ pgprot_t vm_page_prot; - unsigned long vm_flags; /* Flags, see mm.h. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; /* * For areas with an address space and backing store, @@ -658,7 +611,7 @@ struct mm_struct { raw_spinlock_t cid_lock; #endif #ifdef CONFIG_MMU - atomic_long_t pgtables_bytes; /* PTE page table pages */ + atomic_long_t pgtables_bytes; /* size of all page tables */ #endif int map_count; /* number of VMAs */ @@ -915,9 +868,7 @@ struct vma_iterator { static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { - vmi->mas.tree = &mm->mm_mt; - vmi->mas.index = addr; - vmi->mas.node = MAS_START; + mas_init(&vmi->mas, &mm->mm_mt, addr); } #ifdef CONFIG_SCHED_MM_CID @@ -1126,4 +1077,87 @@ enum fault_flag { typedef unsigned int __bitwise zap_flags_t; +/* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: + * + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when + * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to unpin_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via unpin_user_page(). + * + * Please see Documentation/core-api/pin_user_pages.rst for more information. + */ + +enum { + /* check pte is writable */ + FOLL_WRITE = 1 << 0, + /* do get_page on page */ + FOLL_GET = 1 << 1, + /* give error on hole if it would be zero */ + FOLL_DUMP = 1 << 2, + /* get_user_pages read/write w/o permission */ + FOLL_FORCE = 1 << 3, + /* + * if a disk transfer is needed, start the IO and return without waiting + * upon it + */ + FOLL_NOWAIT = 1 << 4, + /* do not fault in pages */ + FOLL_NOFAULT = 1 << 5, + /* check page is hwpoisoned */ + FOLL_HWPOISON = 1 << 6, + /* don't do file mappings */ + FOLL_ANON = 1 << 7, + /* + * FOLL_LONGTERM indicates that the page will be held for an indefinite + * time period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + */ + FOLL_LONGTERM = 1 << 8, + /* split huge pmd before returning */ + FOLL_SPLIT_PMD = 1 << 9, + /* allow returning PCI P2PDMA pages */ + FOLL_PCI_P2PDMA = 1 << 10, + /* allow interrupts from generic signals */ + FOLL_INTERRUPTIBLE = 1 << 11, + + /* See also internal only FOLL flags in mm/internal.h */ +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a3..cee1e4b566d8 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e140277..64a3e051c3c4 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { - struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; @@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, - struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { - range->vma = vma; range->event = event; range->mm = mm; range->start = start; @@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, - struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end, void *owner) + struct mm_struct *mm, unsigned long start, + unsigned long end, void *owner) { - mmu_notifier_range_init(range, event, flags, vma, mm, start, end); + mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } @@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ +#define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) -#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ +#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..9fb1b03b83b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/list_nulls.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> @@ -312,7 +313,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +325,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * @@ -404,7 +405,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ -struct lru_gen_struct { +struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ @@ -412,7 +413,7 @@ struct lru_gen_struct { /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ @@ -426,6 +427,14 @@ struct lru_gen_struct { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; +#ifdef CONFIG_MEMCG + /* the memcg generation this lru_gen_folio belongs to */ + u8 gen; + /* the list segment this lru_gen_folio belongs to */ + u8 seg; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_node list; +#endif }; enum { @@ -461,7 +470,7 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_struct */ + /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; @@ -479,12 +488,87 @@ void lru_gen_init_lruvec(struct lruvec *lruvec); void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); #ifdef CONFIG_MEMCG + +/* + * For each node, memcgs are divided into two generations: the old and the + * young. For each generation, memcgs are randomly sharded into multiple bins + * to improve scalability. For each bin, the hlist_nulls is virtually divided + * into three segments: the head, the tail and the default. + * + * An onlining memcg is added to the tail of a random bin in the old generation. + * The eviction starts at the head of a random bin in the old generation. The + * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes + * the old generation, is incremented when all its bins become empty. + * + * There are four operations: + * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * current generation (old or young) and updates its "seg" to "head"; + * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * current generation (old or young) and updates its "seg" to "tail"; + * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * generation, updates its "gen" to "old" and resets its "seg" to "default"; + * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * young generation, updates its "gen" to "young" and resets its "seg" to + * "default". + * + * The events that trigger the above operations are: + * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; + * 2. The first attempt to reclaim an memcg below low, which triggers + * MEMCG_LRU_TAIL; + * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; + * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * + * Note that memcg LRU only applies to global reclaim, and the round-robin + * incrementing of their max_seq counters ensures the eventual fairness to all + * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + */ +#define MEMCG_NR_GENS 2 +#define MEMCG_NR_BINS 8 + +struct lru_gen_memcg { + /* the per-node memcg generation counter */ + unsigned long seq; + /* each memcg has one lru_gen_folio per node */ + unsigned long nr_memcgs[MEMCG_NR_GENS]; + /* per-node lru_gen_folio list for global reclaim */ + struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS]; + /* protects the above */ + spinlock_t lock; +}; + +void lru_gen_init_pgdat(struct pglist_data *pgdat); + void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); -#endif +void lru_gen_online_memcg(struct mem_cgroup *memcg); +void lru_gen_offline_memcg(struct mem_cgroup *memcg); +void lru_gen_release_memcg(struct mem_cgroup *memcg); +void lru_gen_soft_reclaim(struct lruvec *lruvec); + +#else /* !CONFIG_MEMCG */ + +#define MEMCG_NR_GENS 1 + +struct lru_gen_memcg { +}; + +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + +#endif /* CONFIG_MEMCG */ #else /* !CONFIG_LRU_GEN */ +static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) +{ +} + static inline void lru_gen_init_lruvec(struct lruvec *lruvec) { } @@ -494,6 +578,7 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } #ifdef CONFIG_MEMCG + static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -501,7 +586,24 @@ static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg) { } -#endif + +static inline void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ +} + +#endif /* CONFIG_MEMCG */ #endif /* CONFIG_LRU_GEN */ @@ -524,7 +626,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif @@ -1110,6 +1212,31 @@ struct deferred_split { }; #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Per NUMA node memory failure handling statistics. + */ +struct memory_failure_stats { + /* + * Number of raw pages poisoned. + * Cases not accounted: memory outside kernel control, offline page, + * arch-specific memory_failure (SGX), hwpoison_filter() filtered + * error events, and unpoison actions from hwpoison_unpoison. + */ + unsigned long total; + /* + * Recovery results of poisoned raw pages handled by memory_failure, + * in sync with mf_result. + * total = ignored + failed + delayed + recovered. + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. + */ + unsigned long ignored; + unsigned long failed; + unsigned long delayed; + unsigned long recovered; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -1243,6 +1370,8 @@ typedef struct pglist_data { #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); @@ -1253,6 +1382,9 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif +#ifdef CONFIG_MEMORY_FAILURE + struct memory_failure_stats mf_stats; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 69e93a0c1277..a7e3a3405520 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -531,6 +531,7 @@ PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) +#define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 22be4582faae..bc2e39090a1f 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,15 +7,35 @@ #include <linux/stackdepot.h> struct pglist_data; + +/** + * struct page_ext_operations - per page_ext client operations + * @offset: Offset to the client's data within page_ext. Offset is returned to + * the client by page_ext_init. + * @size: The size of the client data within page_ext. + * @need: Function that returns true if client requires page_ext. + * @init: (optional) Called to initialize client once page_exts are allocated. + * @need_shared_flags: True when client is using shared page_ext->flags + * field. + * + * Each Page Extension client must define page_ext_operations in + * page_ext_ops array. + */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); + bool need_shared_flags; }; +extern bool deferred_struct_pages; + #ifdef CONFIG_PAGE_EXTENSION +/* + * The page_ext_flags users must set need_shared_flags to true. + */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2f5b36f446cc..0acb8e1fb7af 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -547,6 +547,26 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, } /** + * filemap_grab_folio - grab a folio from the page cache + * @mapping: The address space to search + * @index: The page index + * + * Looks up the page cache entry at @mapping & @index. If no folio is found, + * a new folio is created. The folio is locked, marked as accessed, and + * returned. + * + * Return: A found or created folio. NULL if no folio is found and failed to + * create a folio. + */ +static inline struct folio *filemap_grab_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping)); +} + +/** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index @@ -719,16 +739,8 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages); -static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, - nr_pages, pages); -} +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index); @@ -744,6 +756,8 @@ static inline struct page *grab_cache_page(struct address_space *mapping, struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); +struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index, + gfp_t flags); struct page *read_cache_page(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); extern struct page * read_cache_page_gfp(struct address_space *mapping, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 215eb6c3bdc9..f582f7213ea5 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,14 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag); -static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, xa_mark_t tag) -{ - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); -} static inline void pagevec_init(struct pagevec *pvec) { @@ -103,6 +95,11 @@ static inline void folio_batch_init(struct folio_batch *fbatch) fbatch->percpu_pvec_drained = false; } +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; +} + static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 959f52e5867d..27a6df448ee5 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -21,7 +21,16 @@ struct mm_walk; * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) * are skipped. - * @hugetlb_entry: if set, called for each hugetlb entry + * @hugetlb_entry: if set, called for each hugetlb entry. This hook + * function is called with the vma lock held, in order to + * protect against a concurrent freeing of the pte_t* or + * the ptl. In some cases, the hook function needs to drop + * and retake the vma lock in order to avoid deadlocks + * while calling other functions. In such cases the hook + * function must either refrain from accessing the pte or + * ptl after dropping the vma lock, or else revalidate + * those items after re-acquiring the vma lock and before + * accessing them. * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means * "do page table walk over the current vma", returning diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1159b25b0542..c63cd44777ec 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1064,35 +1064,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif -/* - * When replacing an anonymous page by a real (!non) swap entry, we clear - * PG_anon_exclusive from the page and instead remember whether the flag was - * set in the swp pte. During fork(), we have to mark the entry as !exclusive - * (possibly shared). On swapin, we use that information to restore - * PG_anon_exclusive, which is very helpful in cases where we might have - * additional (e.g., FOLL_GET) references on a page and wouldn't be able to - * detect exclusivity. - * - * These functions don't apply to non-swap entries (e.g., migration, hwpoison, - * ...). - */ -#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE -static inline pte_t pte_swp_mkexclusive(pte_t pte) -{ - return pte; -} - -static inline int pte_swp_exclusive(pte_t pte) -{ - return false; -} - -static inline pte_t pte_swp_clear_exclusive(pte_t pte) -{ - return pte; -} -#endif - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) @@ -1214,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long size, + bool mm_wr_locked) { } @@ -1232,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); + unsigned long size, bool mm_wr_locked); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 07481bb87d4e..c758809d5bcf 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,21 @@ struct fs_pin; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +/* + * sysctl for vm.memfd_noexec + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_EXEC was set. + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_NOEXEC_SEAL was set. + * 2: memfd_create() without MFD_NOEXEC_SEAL will be + * rejected. + */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +#endif + struct pid_namespace { struct idr idr; struct rcu_head rcu; @@ -31,6 +46,10 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + /* sysctl for vm.memfd_noexec */ + int memfd_noexec_scope; +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..a4570da03e58 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -194,6 +194,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); +void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, @@ -201,12 +203,19 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) { - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); + if (compound) { + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + atomic_inc(&folio->_entire_mapcount); + } else { + atomic_inc(&page->_mapcount); + } } static inline void page_dup_file_rmap(struct page *page, bool compound) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7ae14c..0e17ae7fbfd3 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d500ea967dc7..103d1000a5a2 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,14 +92,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma, - bool shmem_huge_force) -{ - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, - shmem_huge_force); -} +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -115,6 +109,14 @@ enum sgp_type { int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp); + +static inline struct folio *shmem_read_folio(struct address_space *mapping, + pgoff_t index) +{ + return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); +} static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5834bad8ad78..a61e7d55d0d3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -72,7 +72,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index aa0ee1678d29..f6df03f934e5 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -136,7 +136,7 @@ struct kmem_cache { unsigned int *random_seq; #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 9ca7798d7a31..e58306783d8e 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -1,11 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * A generic stack depot implementation + * Stack depot - a stack trace storage that avoids duplication. + * + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. + * + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Stack traces are never removed from the stack depot. * * Author: Alexander Potapenko <glider@google.com> * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #ifndef _LINUX_STACKDEPOT_H @@ -14,62 +25,143 @@ #include <linux/gfp.h> typedef u32 depot_stack_handle_t; + /* * Number of bits in the handle that stack depot doesn't use. Users may store - * information in them. + * information in them via stack_depot_set/get_extra_bits. */ #define STACK_DEPOT_EXTRA_BITS 5 -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - unsigned int extra_bits, - gfp_t gfp_flags, bool can_alloc); - /* - * Every user of stack depot has to call stack_depot_init() during its own init - * when it's decided that it will be calling stack_depot_save() later. This is - * recommended for e.g. modules initialized later in the boot process, when - * slab_is_available() is true. - * - * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot - * enabled as part of mm_init(), for subsystems where it's known at compile time - * that stack depot will be used. - * - * Another alternative is to call stack_depot_want_early_init(), when the - * decision to use stack depot is taken e.g. when evaluating kernel boot - * parameters, which precedes the enablement point in mm_init(). - * - * stack_depot_init() and stack_depot_want_early_init() can be called regardless - * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print - * functions should only be called from code that makes sure CONFIG_STACKDEPOT - * is enabled. + * Using stack depot requires its initialization, which can be done in 3 ways: + * + * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in + * scenarios where it's known at compile time that stack depot will be used. + * Enabling this config makes the kernel initialize stack depot in mm_init(). + * + * 2. Calling stack_depot_request_early_init() during early boot, before + * stack_depot_early_init() in mm_init() completes. For example, this can + * be done when evaluating kernel boot parameters. + * + * 3. Calling stack_depot_init(). Possible after boot is complete. This option + * is recommended for modules initialized later in the boot process, after + * mm_init() completes. + * + * stack_depot_init() and stack_depot_request_early_init() can be called + * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this + * config is disabled. The save/fetch/print stack depot functions can only be + * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_ + * initializes stack depot via one of the ways listed above. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -void __init stack_depot_want_early_init(void); +void __init stack_depot_request_early_init(void); -/* This is supposed to be called only from mm_init() */ +/* Must be only called from mm_init(). */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } -static inline void stack_depot_want_early_init(void) { } +static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +/** + * __stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * @can_alloc: Allocate stack pools (increased chance of failure if false) + * + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, stack depot can replenish the stack pools in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and fails if no space is left to store the stack trace. + * + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case for contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: Handle of the stack struct stored in depot, 0 on failure + */ +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + +/** + * stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: Handle of the stack trace stored in depot, 0 on failure + */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * stack_depot_fetch - Fetch a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace + * + * Return: Number of frames for the fetched stack + */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); +/** + * stack_depot_print - Print a stack trace from stack depot + * + * @stack: Stack depot handle returned from stack_depot_save() + */ +void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_snprint - Print a stack trace from stack depot into a buffer + * + * @handle: Stack depot handle returned from stack_depot_save() + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed + */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle returned from stack_depot_save() + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ +depot_stack_handle_t __must_check stack_depot_set_extra_bits( + depot_stack_handle_t handle, unsigned int extra_bits); + +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/include/linux/string.h b/include/linux/string.h index db28802ab0a6..c062c581a98b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ceed49516ad..209a425739a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -401,8 +401,8 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_page(struct page *page); -extern void mark_page_lazyfree(struct page *page); +void folio_deactivate(struct folio *folio); +void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); extern void lru_cache_add_inactive_or_unevictable(struct page *page, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b982dd614572..3a451b7afcb3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); #ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); #endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } #ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc..3767f18114ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,7 +73,7 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 096d48aa3437..69250efa03d1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,6 +76,7 @@ struct vmap_area { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; + unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06f9291b6fd5..46020373e155 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,7 +207,7 @@ static inline void wait_on_inode(struct inode *inode) #include <linux/cgroup.h> #include <linux/bio.h> -void __inode_attach_wb(struct inode *inode, struct page *page); +void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); @@ -222,16 +222,16 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest - * @page: page being dirtied (may be NULL) + * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) - __inode_attach_wb(inode, page); + __inode_attach_wb(inode, folio); } /** @@ -290,7 +290,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) #else /* CONFIG_CGROUP_WRITEBACK */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } @@ -366,11 +366,9 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, +typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); -int generic_writepages(struct address_space *mapping, - struct writeback_control *wbc); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 44dd6d6e01bc..741703b45f61 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1643,7 +1643,8 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index, * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. - * This is advanced functionality and is only needed by the page cache. + * This is advanced functionality and is only needed by the page + * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 3d708dae1542..ef75ea606ab2 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -91,12 +91,38 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, +TRACE_EVENT(cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned long count, unsigned int align), + unsigned long count, unsigned int align, int errorno), - TP_ARGS(name, pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align, errorno), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned long, pfn) + __field(const struct page *, page) + __field(unsigned long, count) + __field(unsigned int, align) + __field(int, errorno) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->pfn = pfn; + __entry->page = page; + __entry->count = count; + __entry->align = align; + __entry->errorno = errorno; + ), + + TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d", + __get_str(name), + __entry->pfn, + __entry->page, + __entry->count, + __entry->align, + __entry->errorno) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c..9db52bc4ce19 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 2f86b2ad6d7e..e8c07da58c9f 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -43,6 +43,7 @@ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ +#define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 7a8a26751c23..273a4e15dfcf 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -8,6 +8,10 @@ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U #define MFD_HUGETLB 0x0004U +/* not executable and sealed to prevent changing to executable. */ +#define MFD_NOEXEC_SEAL 0x0008U +/* executable */ +#define MFD_EXEC 0x0010U /* * Huge page size encoding when MFD_HUGETLB is specified, and a huge page diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index a5e06dcbba13..1312a137f7fb 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 65 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 66 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 |