Diffstat (limited to 'mm/shmem.c')
-rw-r--r--   mm/shmem.c | 2676
1 file changed, 1872 insertions, 804 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index 02e62fccc80d..3f194c9842a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,9 +40,10 @@ #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> +#include <linux/unicode.h> #include "swap.h" -static struct vfsmount *shm_mnt; +static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* @@ -65,7 +66,7 @@ static struct vfsmount *shm_mnt; #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> @@ -79,12 +80,12 @@ static struct vfsmount *shm_mnt; #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> +#include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_SIZE/512) #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ @@ -97,7 +98,7 @@ static struct vfsmount *shm_mnt; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -106,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -122,14 +123,25 @@ struct shmem_options { bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *encoding; + bool strict_encoding; +#endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 -#define SHMEM_SEEN_NOSWAP 16 -#define SHMEM_SEEN_QUOTA 32 +#define SHMEM_SEEN_QUOTA 16 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +static bool shmem_orders_configured __initdata; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -146,9 +158,8 @@ static unsigned long shmem_default_max_inodes(void) #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - vm_fault_t *fault_type); + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -163,20 +174,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_NORESERVE) ? + return (flags & SHMEM_F_NORESERVE) ? 
0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { - if (!(flags & VM_NORESERVE)) { + if (!(flags & SHMEM_F_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); @@ -189,12 +200,12 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags, long pages) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, @@ -203,30 +214,30 @@ static inline int shmem_acct_block(unsigned long flags, long pages) static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (flags & VM_NORESERVE) + if (flags & SHMEM_F_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static int shmem_inode_acct_block(struct inode *inode, long pages) +int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; - if (shmem_acct_block(info->flags, pages)) + if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); - if (err) + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; - - percpu_counter_add(&sbinfo->used_blocks, pages); + } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) @@ -254,7 +265,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -const struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -263,18 +274,24 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool shmem_mapping(const struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static 
DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA @@ -311,7 +328,7 @@ static void shmem_disable_quotas(struct super_block *sb) dquot_quota_off(sb, type); } -static struct dquot **shmem_get_dquots(struct inode *inode) +static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } @@ -414,10 +431,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; spin_lock(&info->lock); @@ -428,12 +448,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ - if (swapped > 0) + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; freed += swapped; + } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); @@ -441,13 +464,14 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; - if (shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ @@ -487,15 +511,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. 
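The comment above describes the new shmem_confirm_swap() contract: rather than a yes/no answer, it reports the order of the swap entry if the expected entry is still present at that index, and -1 otherwise, so callers can cope with large swap entries. A minimal userspace sketch of that contract and of how a caller derives the sub-entry for an index in the middle of a large entry (the toy_* names and types are invented for this sketch, not kernel API):

#include <stdio.h>

/* Toy model: one large swap entry of the given order, stored at an
 * order-aligned base index and covering 1 << order slots. */
struct toy_entry {
	unsigned long base;	/* order-aligned index of the entry */
	unsigned long swap_val;	/* swap value stored for the base slot */
	int order;		/* 0 for a single page, >0 for a large entry */
};

/* Return the entry's order if the slot at @index still holds the
 * expected swap value, else -1 (entry gone or replaced). */
static int toy_confirm_swap(const struct toy_entry *e,
			    unsigned long index, unsigned long expected)
{
	unsigned long aligned = index & ~((1UL << e->order) - 1);

	if (aligned != e->base || e->swap_val != expected)
		return -1;
	return e->order;
}

int main(void)
{
	struct toy_entry e = { .base = 64, .swap_val = 1000, .order = 4 };
	unsigned long index = 70;	/* points into the middle of the entry */
	int order = toy_confirm_swap(&e, index, 1000);

	if (order >= 0) {
		/* Same arithmetic as the swapin path below: offset of the
		 * sub-entry relative to the start of the large entry. */
		unsigned long offset = index - (index & ~((1UL << order) - 1));

		printf("order %d, sub-entry value %lu\n", order, e.swap_val + offset);
	}
	return 0;
}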
*/ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -507,9 +543,9 @@ static bool shmem_confirm_swap(struct address_space *mapping, * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, - * also respect fadvise()/madvise() hints; + * also respect madvise() hints; * SHMEM_HUGE_ADVISE: - * only allocate huge pages if requested with fadvise()/madvise(); + * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 @@ -533,58 +569,137 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; + +#undef SHMEM_HUGE_DEFAULT -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; + +#undef TMPFS_HUGE_DEFAULT + +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) { + pgoff_t aligned_index; + unsigned long order; loff_t i_size; + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + if (!S_ISREG(inode->i_mode)) - return false; - if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs with 'huge=always' or 'huge=within_size' mount option, + * we will always try PMD-sized order first. If that failed, it will + * fall back to small large folios. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + within_size_orders = shmem_get_orders_within_size(inode, + THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); + if (within_size_orders > 0) + return within_size_orders; + fallthrough; case SHMEM_HUGE_ADVISE: - if (mm && (vm_flags & VM_HUGEPAGE)) - return true; + if (vm_flags & VM_HUGEPAGE) + return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: - return false; + return 0; } } -#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { + int huge; + + if (!str) + return -EINVAL; + if (!strcmp(str, "never")) - return SHMEM_HUGE_NEVER; - if (!strcmp(str, "always")) - return SHMEM_HUGE_ALWAYS; - if (!strcmp(str, "within_size")) - return SHMEM_HUGE_WITHIN_SIZE; - if (!strcmp(str, "advise")) - return SHMEM_HUGE_ADVISE; - if (!strcmp(str, "deny")) - return SHMEM_HUGE_DENY; - if (!strcmp(str, "force")) - return SHMEM_HUGE_FORCE; - return -EINVAL; + huge = SHMEM_HUGE_NEVER; + else if (!strcmp(str, "always")) + huge = SHMEM_HUGE_ALWAYS; + else if (!strcmp(str, "within_size")) + huge = SHMEM_HUGE_WITHIN_SIZE; + else if (!strcmp(str, "advise")) + huge = SHMEM_HUGE_ADVISE; + else if (!strcmp(str, "deny")) + huge = SHMEM_HUGE_DENY; + else if (!strcmp(str, "force")) + huge = SHMEM_HUGE_FORCE; + else + return -EINVAL; + + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + + return huge; } -#endif #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) @@ -610,15 +725,14 @@ static const char *shmem_format_huge(int huge) #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; - LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int split = 0; + unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -636,13 +750,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto next; } - /* Check if there's anything to gain */ - if (round_up(inode->i_size, PAGE_SIZE) == - round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_move(&info->shrinklist, &to_remove); - goto next; - } - list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; @@ -651,34 +758,36 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); - list_for_each_safe(pos, next, &to_remove) { - info = list_entry(pos, struct shmem_inode_info, shrinklist); - inode = &info->vfs_inode; - list_del_init(&info->shrinklist); - iput(inode); - } - list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; int ret; - pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) + if (nr_to_free && freed >= nr_to_free) goto move_back; - index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; - folio = filemap_get_folio(inode->i_mapping, index); - if (IS_ERR(folio)) + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) goto drop; - /* No huge page at the end of the file: nothing to split */ + /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. @@ -699,6 +808,7 @@ next: if (ret) goto move_back; + freed += next - end; split++; drop: list_del_init(&info->shrinklist); @@ -742,90 +852,93 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_free) { - return false; + return 0; } -static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* - * Like filemap_add_folio, but error if expected item has gone. + * Somewhat like filemap_add_folio, but error if expected item has gone. 
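The rewritten shmem_add_to_page_cache() that follows no longer charges the memcg itself and, when replacing swap entries, insists that the whole range being overwritten is filled with the expected swap entries back to back: the running value advances by 1 << order for each conflicting entry and must land exactly nr slots past the first value. A rough standalone model of that contiguity check, assuming a small array of (value, order) records stands in for the xarray walk (toy_* names are invented here):

#include <stdbool.h>
#include <stdio.h>

struct toy_slot {
	unsigned long val;	/* swap value of this (possibly large) entry */
	int order;		/* entry covers 1 << order pages */
};

/*
 * Check that @entries exactly cover @nr pages starting at swap value
 * @first, with no holes and no foreign values - the invariant the new
 * shmem_add_to_page_cache() enforces before storing the folio.
 */
static bool toy_range_is_expected(const struct toy_slot *entries, int n,
				  unsigned long first, unsigned long nr)
{
	unsigned long iter = first;
	int i;

	for (i = 0; i < n; i++) {
		if (entries[i].val != iter)
			return false;		/* hole or foreign entry */
		iter += 1UL << entries[i].order;
	}
	return iter - first == nr;		/* covered exactly nr pages */
}

int main(void)
{
	/* A 16-page range made of one order-3 entry and two order-2
	 * entries (8 + 4 + 4 pages), contiguous vs. not contiguous. */
	struct toy_slot ok[]  = { {100, 3}, {108, 2}, {112, 2} };
	struct toy_slot bad[] = { {100, 3}, {109, 2}, {113, 2} };

	printf("%d %d\n", toy_range_is_expected(ok, 3, 100, 16),
	       toy_range_is_expected(bad, 3, 100, 16));	/* 1 0 */
	return 0;
}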
*/ -static int shmem_add_to_page_cache(struct folio *folio, - struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp, - struct mm_struct *charge_mm) +int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); - int error; + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; - if (!folio_test_swapcache(folio)) { - error = mem_cgroup_charge(folio, charge_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto error; - } - } + gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_ALLOC); - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - } + shmem_update_stats(folio, nr); mapping->nrpages += nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - error = xas_error(&xas); - goto error; + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); } return 0; -error: - folio->mapping = NULL; - folio_ref_sub(folio, nr); - return error; } /* - * Like delete_from_page_cache, but substitutes swap for @folio. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { @@ -837,26 +950,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). 
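As the comment above says, shmem_free_swap() now returns how many pages were freed (1 << order of the entry, or 0 if the entry was already gone), and the truncation loops accumulate that count instead of counting successful calls. A small illustration of that bookkeeping, under the assumption that each toy entry records its own order:

#include <stdio.h>

struct toy_swap_slot {
	unsigned long val;	/* stored swap value, 0 means already freed */
	int order;		/* entry spans 1 << order pages */
};

/* Returns the number of pages freed, 0 if the entry was already gone
 * or replaced - mirroring the new shmem_free_swap() return value. */
static long toy_free_swap(struct toy_swap_slot *slot, unsigned long expected)
{
	if (slot->val != expected)
		return 0;
	slot->val = 0;
	return 1L << slot->order;
}

int main(void)
{
	struct toy_swap_slot slots[] = { {100, 0}, {200, 4}, {0, 0} };
	long nr_swaps_freed = 0;

	nr_swaps_freed += toy_free_swap(&slots[0], 100);	/* 1 page */
	nr_swaps_freed += toy_free_swap(&slots[1], 200);	/* 16 pages */
	nr_swaps_freed += toy_free_swap(&slots[2], 300);	/* gone: 0 */
	printf("freed %ld pages\n", nr_swaps_freed);		/* 17 */
	return 0;
}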
*/ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -870,16 +986,16 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) - swapped++; + if (xa_is_value(folio)) + swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { @@ -887,7 +1003,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, cond_resched_rcu(); } } - rcu_read_unlock(); return swapped << PAGE_SHIFT; @@ -970,7 +1085,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * (although in some cases this is just a waste of time). */ folio = NULL; - shmem_get_folio(inode, index, &folio, SGP_READ); + shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } @@ -978,7 +1093,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; @@ -1009,7 +1124,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1035,7 +1150,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); @@ -1076,14 +1191,17 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1098,7 +1216,24 @@ whole_folios: } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - truncate_inode_folio(mapping, folio); + + if (!folio_test_large(folio)) { + truncate_inode_folio(mapping, folio); + } else if (truncate_inode_partial_folio(folio, lstart, lend)) { + /* + * If we split a page, reset the loop so + * that we pick up the new sub pages. 
+ * Otherwise the THP was entirely + * dropped or the target range was + * zeroed, so just continue the loop as + * is. + */ + if (!folio_test_large(folio)) { + folio_unlock(folio); + index = start; + break; + } + } } folio_unlock(folio); } @@ -1109,10 +1244,10 @@ whole_folios: shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1138,7 +1273,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(inode, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1179,6 +1314,8 @@ static int shmem_setattr(struct mnt_idmap *idmap, return -EPERM; if (newsize != oldsize) { + if (info->flags & SHMEM_F_MAPPING_FROZEN) + return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) @@ -1213,7 +1350,6 @@ static int shmem_setattr(struct mnt_idmap *idmap, if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); - if (error) return error; } @@ -1224,7 +1360,7 @@ static int shmem_setattr(struct mnt_idmap *idmap, if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) - inode->i_mtime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; @@ -1253,11 +1389,11 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... 
*/ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } @@ -1271,9 +1407,9 @@ static void shmem_evict_inode(struct inode *inode) #endif } -static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, struct folio_batch *fbatch, - pgoff_t *indices, unsigned int type) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; @@ -1306,7 +1442,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, } rcu_read_unlock(); - return xas.xa_index; + return folio_batch_count(fbatch); } /* @@ -1324,12 +1460,8 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; - error = shmem_swapin_folio(inode, indices[i], - &folio, SGP_CACHE, - mapping_gfp_mask(mapping), - NULL, NULL); + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); @@ -1355,8 +1487,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) do { folio_batch_init(&fbatch); - shmem_find_swap_entries(mapping, start, &fbatch, indices, type); - if (folio_batch_count(&fbatch) == 0) { + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { ret = 0; break; } @@ -1384,7 +1516,8 @@ int shmem_unuse(unsigned int type) if (list_empty(&shmem_swaplist)) return 0; - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1397,69 +1530,79 @@ int shmem_unuse(unsigned int type) * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @plug: swap plug + * @folio_list: list to put back folios on split + * + * Move the folio from the page cache to the swap cache. 
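shmem_writeout(), which replaces shmem_writepage() below, has to decide whether a large folio must be split before swap-out: splitting is needed when CONFIG_THP_SWAP is not enabled, or when the folio straddles the end of the file (more precisely, the end of any fallocate()d range), since shrinking beyond i_size does not split swap entries. A hedged sketch of just that decision, with the folio reduced to an index range (toy_needs_split is an invented name):

#include <stdbool.h>
#include <stdio.h>

/*
 * Decide whether a large folio covering [index, next_index) must be
 * split before swap-out. @fallocend is the first page index beyond
 * i_size (or beyond the preallocated range, if fallocate() went past
 * EOF); @thp_swap says whether whole-folio swap-out is supported.
 * Simplified model of the check at the top of shmem_writeout().
 */
static bool toy_needs_split(unsigned long index, unsigned long next_index,
			    unsigned long fallocend, bool thp_swap)
{
	if (!thp_swap)
		return true;
	/* Folio crosses the end of the (pre)allocated range? */
	return fallocend > index && fallocend < next_index;
}

int main(void)
{
	/* 16-page folio at index 512, file ends inside it at page 520 */
	printf("%d\n", toy_needs_split(512, 528, 520, true));	/* 1: split */
	/* Same folio, but the file ends at or beyond the folio */
	printf("%d\n", toy_needs_split(512, 528, 528, true));	/* 0 */
	return 0;
}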
*/ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - swp_entry_t swap; pgoff_t index; + int nr_pages; + bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ - if (WARN_ON_ONCE(!wbc->for_reclaim)) - goto redirty; - - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_folio_to_list(folio, folio_list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1481,7 +1624,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); @@ -1493,44 +1636,66 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) folio_mark_uptodate(folio); } - swap = folio_alloc_swap(folio); - if (!swap.val) - goto redirty; + if (!folio_alloc_swap(folio)) { + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + + swap_shmem_alloc(folio->swap, nr_pages); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. 
Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); - - if (add_to_swap_cache(folio, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, - NULL) == 0) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); - - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - swap_writepage(&folio->page, wbc); - return 0; - } + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } - mutex_unlock(&shmem_swaplist_mutex); - put_swap_folio(folio, swap); + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } + + /* + * The swap_cache_del_folio() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. + */ + swap_cache_del_folio(folio); + goto redirty; + } + if (nr_pages > 1) + goto try_split; redirty: folio_mark_dirty(folio); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ - folio_unlock(folio); - return 0; + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1565,42 +1730,22 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} - -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; - shmem_pseudo_vma_init(&pvma, info, index); - page = 
swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + folio = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); - if (!page) - return NULL; - return page_folio(page); + return folio; } /* @@ -1627,70 +1772,316 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct folio *shmem_alloc_hugefolio(gfp_t gfp, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool shmem_hpage_pmd_enabled(void) +{ + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && + shmem_huge != SHMEM_HUGE_NEVER) + return true; + + return false; +} + +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + loff_t write_end, bool shmem_huge_force) +{ + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; + unsigned int global_orders; + + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) + return 0; + + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; + + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. + */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); + + if (global_orders > 0) + mask |= READ_ONCE(huge_shmem_orders_inherit); + + return THP_ORDERS_ALL_FILE_DEFAULT & mask; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + pgoff_t aligned_index; + unsigned long pages; + int order; + + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. 
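The conflict check described above keeps only those orders whose naturally aligned index range is still entirely absent from the page cache; shmem_suitable_orders() walks the candidate orders from highest to lowest. A standalone sketch of that selection, assuming a simple occupancy array stands in for the xarray lookup (toy_* names are invented; the real function returns a reduced mask of orders rather than a single order):

#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGES 256

static bool present[TOY_PAGES];	/* stand-in for xa_find() on i_pages */

static bool toy_range_empty(unsigned long start, unsigned long n)
{
	for (unsigned long i = start; i < start + n && i < TOY_PAGES; i++)
		if (present[i])
			return false;
	return true;
}

/*
 * From a bitmask of candidate orders, pick the highest order whose
 * order-aligned range around @index has no existing entries - the
 * same shape as the loop in shmem_suitable_orders().
 */
static int toy_pick_order(unsigned long index, unsigned long orders)
{
	for (int order = 20; order >= 0; order--) {	/* 20: toy upper bound */
		unsigned long pages, aligned;

		if (!(orders & (1UL << order)))
			continue;
		pages = 1UL << order;
		aligned = index & ~(pages - 1);
		if (toy_range_empty(aligned, pages))
			return order;
	}
	return -1;	/* nothing suitable, caller falls back to order 0 */
}

int main(void)
{
	present[130] = true;	/* one small page already in the cache */
	/* candidates: orders 4 and 2; the order-4 range [128,144) conflicts */
	printf("picked order %d\n",
	       toy_pick_order(133, (1UL << 4) | (1UL << 2)));	/* 2 */
	return 0;
}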
+ */ + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } + + return orders; +} +#else +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct address_space *mapping = info->vfs_inode.i_mapping; - pgoff_t hindex; + struct mempolicy *mpol; + pgoff_t ilx; struct folio *folio; - hindex = round_down(index, HPAGE_PMD_NR); - if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, - XA_PRESENT)) - return NULL; + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - shmem_pseudo_vma_init(&pvma, info, hindex); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); return folio; } -static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) { - struct vm_area_struct pvma; - struct folio *folio; + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long suitable_orders = 0; + struct folio *folio = NULL; + pgoff_t aligned_index; + long pages; + int error, order; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + orders = 0; + + if (orders > 0) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + + order = highest_order(suitable_orders); + while (suitable_orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; + goto allocated; + } + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); + order = next_order(&suitable_orders, order); + } + } else { + pages = 1; + folio = shmem_alloc_folio(gfp, 0, info, index); + } + if (!folio) + return ERR_PTR(-ENOMEM); + +allocated: + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); + } + goto unlock; + } - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. 
+ */ + shmem_unused_huge_shrink(sbinfo, NULL, pages); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. + */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *folio; - int nr; - int err; + int nr_pages = 1 << order; + struct folio *new; + gfp_t alloc_gfp; + void *shadow; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - huge = false; - nr = huge ? HPAGE_PMD_NR : 1; + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + alloc_gfp = gfp; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + goto fallback; - err = shmem_inode_acct_block(inode, nr); - if (err) - goto failed; + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; + } - if (huge) - folio = shmem_alloc_hugefolio(gfp, info, index); - else - folio = shmem_alloc_folio(gfp, info, index); - if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - return folio; + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + alloc_gfp, entry)) { + folio_put(new); + new = ERR_PTR(-ENOMEM); + goto fallback; } - err = -ENOMEM; - shmem_inode_unacct_blocks(inode, nr); -failed: - return ERR_PTR(err); + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. 
+ */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + memcg1_swapin(entry, nr_pages); + shadow = swap_cache_get_shadow(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -1711,30 +2102,33 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swp_offset(entry); - swap_mapping = swap_address_space(entry); + struct swap_cluster_info *ci; + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + int nr_pages = folio_nr_pages(old); + int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1744,47 +2138,37 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ - xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); - if (!error) { - mem_cgroup_migrate(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); - } - xa_unlock_irq(&swap_mapping->i_pages); - - if (unlikely(error)) { - /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. - */ - old = new; - } else { - folio_add_lru(new); - *foliop = new; - } + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); + swap_cluster_unlock_irq(ci); + + folio_add_lru(new); + *foliop = new; folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. 
+ */ + folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1793,15 +2177,89 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + int split_order = 0; + int i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + void *old = NULL; + int cur_order; + pgoff_t swap_index; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + cur_order = xas_get_order(&xas); + if (!cur_order) + goto unlock; + + /* Try to split large swap entry in pagecache */ + swap_index = round_down(index, 1 << cur_order); + split_order = xas_try_split_min_order(cur_order); + + while (cur_order > 0) { + pgoff_t aligned_index = + round_down(index, 1 << cur_order); + pgoff_t swap_offset = aligned_index - swap_index; + + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, cur_order); + if (xas_error(&xas)) + goto unlock; + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << cur_order; + i += (1 << split_order)) { + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), + swp_offset(swap) + swap_offset + + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + cur_order = split_order; + split_order = xas_try_split_min_order(split_order); + } + +unlock: + xas_unlock_irq(&xas); + + if (!xas_nomem(&xas, gfp)) + break; + } + + if (xas_error(&xas)) + return xas_error(&xas); + + return 0; } /* @@ -1816,50 +2274,114 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? 
vma->vm_mm : NULL; + swp_entry_t swap; + softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; - swp_entry_t swap; - int error; + bool skip_swapcache = false; + int error, nr_pages, order; + pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; - if (is_poisoned_swp_entry(swap)) + if (softleaf_is_poison_marker(index_entry)) return -EIO; - si = get_swap_device(swap); - if (!si) { - if (!shmem_confirm_swap(mapping, index, swap)) + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); + if (unlikely(!si)) { + if (order < 0) return -EEXIST; else return -EINVAL; } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } + + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } /* Look it up and read it in.. */ - folio = swap_cache_get_folio(swap, NULL, 0); + folio = swap_cache_get_folio(swap); if (!folio) { - /* Or update major stats only when swapin succeeds?? */ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* Cached swapin only supports order 0 folio */ + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - folio = shmem_swapin(swap, gfp, info, index); - if (!folio) { - error = -ENOMEM; - goto failed; + count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } else { + swap_update_readahead(folio, NULL, 0); + } + + if (order > folio_order(folio)) { + /* + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) + goto failed_nolock; } - /* We have to do this with folio locked to prevent races */ + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); + } + + /* + * We have to do this with the folio locked to prevent races. 
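Earlier in shmem_swapin_folio(), when the folio that came back from swap is large, both the swap value and the faulting index are rounded down by the folio size, so the folio is inserted at its natural alignment and lines up with the entries it replaces. A small illustration of that rounding (illustrative arithmetic only; toy_align_to_folio is an invented name, and the example assumes large swap allocations are naturally aligned, as the comment above states):

#include <stdio.h>

/*
 * Align the faulting index and its swap value down to the folio
 * boundary, as shmem_swapin_folio() does before the locked checks
 * and the page cache insertion.
 */
static void toy_align_to_folio(unsigned long *index, unsigned long *swap_val,
			       unsigned long nr_pages)
{
	/* round_down(): large swap allocations are naturally aligned, so
	 * both values land on the start of the same large folio. */
	*index    -= *index    % nr_pages;
	*swap_val -= *swap_val % nr_pages;
}

int main(void)
{
	/* sub-entry at offset 6 of a 16-page folio whose first slot is 1024 */
	unsigned long index = 70, swap_val = 1030, nr_pages = 16;

	toy_align_to_folio(&index, &swap_val, nr_pages);
	printf("index %lu, swap %lu\n", index, swap_val);	/* 64, 1024 */
	return 0;
}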
+ * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. + */ folio_lock(folio); - if (!folio_test_swapcache(folio) || - folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + if ((!skip_swapcache && !folio_test_swapcache(folio)) || + shmem_confirm_swap(mapping, index, swap) < 0 || + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } @@ -1873,42 +2395,50 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, - swp_to_radix_entry(swap), gfp, - charge_mm); + swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + swap_cache_del_folio(folio); + } folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } put_swap_device(si); return error; @@ -1921,37 +2451,33 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vma, vmf, and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo; - struct mm_struct *charge_mm; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm; struct folio *folio; - pgoff_t hindex; - gfp_t huge_gfp; int error; - int once = 0; - int alloced = 0; + bool alloced; + unsigned long orders = 0; + + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; - } - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? 
vma->vm_mm : NULL; + alloced = false; + fault_mm = vma ? vma->vm_mm : NULL; - folio = filemap_get_entry(mapping, index); + folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); @@ -1961,7 +2487,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; @@ -1973,7 +2499,7 @@ repeat: folio_lock(folio); /* Has the folio been truncated or swapped out? */ - if (unlikely(folio->mapping != mapping)) { + if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; @@ -2008,58 +2534,41 @@ repeat: return 0; } - if (!shmem_is_huge(inode, index, false, - vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) - goto alloc_nohuge; - - huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); - if (IS_ERR(folio)) { -alloc_nohuge: - folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); + if (orders > 0) { + gfp_t huge_gfp; + + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); + if (!IS_ERR(folio)) { + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); + goto alloced; + } + if (PTR_ERR(folio) == -EEXIST) + goto repeat; } - if (IS_ERR(folio)) { - int retry = 5; + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); + if (IS_ERR(folio)) { error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; folio = NULL; - if (error != -ENOSPC) - goto unlock; - /* - * Try to reclaim some space by splitting a large folio - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } goto unlock; } - hindex = round_down(index, folio_nr_pages(folio)); - - if (sgp == SGP_WRITE) - __folio_set_referenced(folio); - - error = shmem_add_to_page_cache(folio, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK, - charge_mm); - if (error) - goto unacct; - - folio_add_lru(folio); - shmem_recalc_inode(inode, folio_nr_pages(folio), 0); +alloced: alloced = true; - - if (folio_test_pmd_mappable(folio) && + if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - folio_next_index(folio) - 1) { + folio_next_index(folio)) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. @@ -2077,6 +2586,8 @@ alloc_nohuge: spin_unlock(&sbinfo->shrinklist_lock); } + if (sgp == SGP_WRITE) + folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ @@ -2100,11 +2611,6 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - if (alloced) { - folio_clear_dirty(folio); - filemap_remove_folio(folio); - shmem_recalc_inode(inode, 0, 0); - } error = -EINVAL; goto unlock; } @@ -2115,121 +2621,147 @@ out: /* * Error recovery. 
*/ -unacct: - shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - - if (folio_test_large(folio)) { - folio_unlock(folio); - folio_put(folio); - goto alloc_nohuge; - } unlock: + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } - if (error == -ENOSPC && !once++) { - shmem_recalc_inode(inode, 0, 0); - goto repeat; - } - if (error == -EEXIST) - goto repeat; return error; } -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp) +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @write_end: end of a write, could extend inode size + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. + */ +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) { - return shmem_get_folio_gfp(inode, index, foliop, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } +EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. 
+ */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; + } + return ret; +} + static vm_fault_t shmem_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; + vm_fault_t ret = 0; int err; - vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_rwsem. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_undo_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_rwsem in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (shmem_falloc && - shmem_falloc->waitq && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - struct file *fpin; - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); - - ret = VM_FAULT_NOPAGE; - fpin = maybe_unlock_mmap_for_io(vmf, NULL); - if (fpin) - ret = VM_FAULT_RETRY; - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the shmem_fallocate() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). 
- */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - - if (fpin) - fput(fpin); + ret = shmem_falloc_wait(vmf, inode); + if (ret) return ret; - } - spin_unlock(&inode->i_lock); } - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, - gfp, vma, vmf, &ret); + WARN_ON_ONCE(vmf->page != NULL); + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, + gfp, vmf, &ret); if (err) return vmf_error(err); - if (folio) + if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } return ret; } @@ -2237,19 +2769,17 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; + unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2262,8 +2792,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (shmem_huge == SHMEM_HUGE_DENY) return addr; - if (len < HPAGE_PMD_SIZE) - return addr; if (flags & MAP_FIXED) return addr; /* @@ -2275,8 +2803,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (uaddr == addr) return addr; + hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); @@ -2289,33 +2820,53 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif } - if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } - offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); - if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + if (len < hpage_size) return addr; - if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) + return addr; + if ((addr & (hpage_size - 1)) == offset) return addr; - inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; - inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) - inflated_addr += HPAGE_PMD_SIZE; + inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; @@ -2330,15 +2881,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { @@ -2351,15 +2928,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). 
*/ - if (lock && !(info->flags & VM_LOCKED)) { + if (lock && !(info->flags & SHMEM_F_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; - info->flags |= VM_LOCKED; + info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && ucounts) { + if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); - info->flags &= ~VM_LOCKED; + info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; @@ -2368,25 +2945,17 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +static int shmem_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_future_write(info->seals, vma); - if (ret) - return ret; - - /* arm64 - allow memory tagging on RAM-based files */ - vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) - vma->vm_ops = &shmem_vm_ops; + desc->vm_ops = &shmem_vm_ops; else - vma->vm_ops = &shmem_anon_vm_ops; + desc->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -2399,13 +2968,62 @@ static int shmem_file_open(struct inode *inode, struct file *file) #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#if IS_ENABLED(CONFIG_UNICODE) +/* + * shmem_inode_casefold_flags - Deal with casefold file attribute flag + * + * The casefold file attribute needs some special checks. I can just be added to + * an empty dir, and can't be removed from a non-empty dir. + */ +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + unsigned int old = inode->i_flags; + struct super_block *sb = inode->i_sb; + + if (fsflags & FS_CASEFOLD_FL) { + if (!(old & S_CASEFOLD)) { + if (!sb->s_encoding) + return -EOPNOTSUPP; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + *i_flags = *i_flags | S_CASEFOLD; + } else if (old & S_CASEFOLD) { + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + return 0; +} +#else +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + if (fsflags & FS_CASEFOLD_FL) + return -EOPNOTSUPP; + + return 0; +} +#endif + /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; + int ret; + + ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); + if (ret) + return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; @@ -2416,10 +3034,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) /* * But FS_NODUMP_FL does not require any action in i_flags. 
*/ - inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); + + return 0; } #else -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL @@ -2445,7 +3065,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, if (err) return ERR_PTR(err); - inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); @@ -2455,27 +3074,29 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; - info->flags = flags & VM_NORESERVE; - info->i_crtime = inode->i_mtime; + info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); + shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); - mapping_set_large_folios(inode->i_mapping); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: @@ -2565,7 +3186,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. 
Release the page, to @@ -2580,7 +3201,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, info, pgoff); + folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; @@ -2637,8 +3258,10 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; @@ -2666,9 +3289,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2685,28 +3308,29 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && + pos + len > inode->i_size)) + return -EPERM; + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; - *pagep = folio_file_page(folio, index); - if (PageHWPoison(*pagep)) { + if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; return -EIO; } + *foliop = folio; return 0; } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) @@ -2736,28 +3360,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long offset; int error = 0; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - - index = *ppos >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; for (;;) { struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); + bool fallback_page_copy = false; + size_t fsize; - end_index = i_size >> PAGE_SHIFT; - if (index > end_index) + if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) - break; - } - error = shmem_get_folio(inode, index, &folio, SGP_READ); + index = iocb->ki_pos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -2772,24 +3388,29 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ - nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index == end_index) { 
- nr = i_size & ~PAGE_MASK; - if (nr <= offset) { - if (folio) - folio_put(folio); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + if (folio && likely(!fallback_page_copy)) + fsize = folio_size(folio); + else + fsize = PAGE_SIZE; + offset = iocb->ki_pos & (fsize - 1); + nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* @@ -2797,10 +3418,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_copy)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + /* - * Mark the page accessed if we read the beginning. + * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); @@ -2808,9 +3434,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ - ret = copy_page_to_iter(page, offset, nr, to); + if (likely(!fallback_page_copy)) + ret = copy_folio_to_iter(folio, offset, nr, to); + else + ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); - } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but @@ -2828,9 +3456,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } retval += ret; - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; + iocb->ki_pos += ret; if (!iov_iter_count(to)) break; @@ -2841,7 +3467,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? 
retval : error; } @@ -2898,7 +3523,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -2925,16 +3550,21 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { + bool fallback_page_splice = false; + struct page *page = NULL; + pgoff_t index; + size_t size; + if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, - SGP_READ); + index = *ppos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -2943,12 +3573,15 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (folio) { folio_unlock(folio); - if (folio_test_hwpoison(folio) || - (folio_test_large(folio) && - folio_test_has_hwpoisoned(folio))) { + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_splice = true; } /* @@ -2962,7 +3595,17 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; - part = min_t(loff_t, isize - *ppos, len); + /* + * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned + * pages. + */ + size = len; + if (unlikely(fallback_page_splice)) { + size_t offset = *ppos & ~PAGE_MASK; + + size = umin(size, PAGE_SIZE - offset); + } + part = min_t(loff_t, isize - *ppos, size); if (folio) { /* @@ -2970,8 +3613,12 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_splice)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can @@ -2990,7 +3637,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); @@ -3038,6 +3685,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); + if (info->flags & SHMEM_F_MAPPING_FROZEN) { + error = -EPERM; + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); @@ -3112,16 +3764,19 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. 
*/ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_get_folio(inode, index, &folio, - SGP_FALLOC); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ @@ -3144,7 +3799,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) @@ -3211,6 +3866,9 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + return -EINVAL; + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3218,8 +3876,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, + error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3229,10 +3886,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + + d_make_persistent(dentry, inode); return error; out_iput: @@ -3248,14 +3905,11 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } - - error = security_inode_init_security(inode, dir, - NULL, + error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3271,16 +3925,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, @@ -3292,10 +3946,11 @@ static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, /* * Link a file.. 
*/ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret = 0; + int ret; /* * No ordinary (disk based) filesystem counts links as inodes; @@ -3307,26 +3962,19 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) - goto out; + return ret; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); - goto out; + return ret; } dir->i_size += BOGO_DIRENT_SIZE; - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); inode_inc_iversion(dir); - inc_nlink(inode); - ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ - d_instantiate(dentry, inode); -out: - return ret; + return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) @@ -3339,11 +3987,16 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); inode_inc_iversion(dir); - drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + simple_unlink(dir, dentry); + + /* + * For now, VFS can't deal with case-insensitive negative dentries, so + * we invalidate them + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + return 0; } @@ -3415,8 +4068,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return error; } - simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); - error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; @@ -3446,6 +4098,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3453,7 +4106,6 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3468,18 +4120,19 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); + inode->i_mapping->a_ops = &shmem_aops; + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; - inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); @@ -3488,10 +4141,9 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 
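/*
 * The mtime/ctime update above and the i_version bump below let
 * i_version-based change detection (such as the NFS change attribute)
 * observe that a new directory entry was created for the symlink.
 */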
inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); return 0; out_remove_offset: @@ -3507,8 +4159,7 @@ static void shmem_put_link(void *arg) folio_put(arg); } -static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; @@ -3524,7 +4175,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) @@ -3542,7 +4193,7 @@ static const char *shmem_get_link(struct dentry *dentry, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -3552,20 +4203,27 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int shmem_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; - info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - shmem_set_inode_flags(inode, info->fsflags); + ret = shmem_set_inode_flags(inode, flags, dentry); + + if (ret) + return ret; + + info->fsflags = flags; + inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; @@ -3582,8 +4240,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, * Callback for security_inode_init_security() for acquiring xattrs. 
*/ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -3714,7 +4371,7 @@ static const struct xattr_handler shmem_user_xattr_handler = { .set = shmem_xattr_handler_set, }; -static const struct xattr_handler *shmem_xattr_handlers[] = { +static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, @@ -3767,7 +4424,6 @@ static struct dentry *shmem_find_alias(struct inode *inode) return alias ?: d_find_any_alias(inode); } - static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3846,6 +4502,9 @@ enum shmem_param { Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, + Opt_casefold_version, + Opt_casefold, + Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3857,14 +4516,14 @@ static const struct constant_table shmem_param_enums_huge[] = { }; const struct fs_parameter_spec shmem_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), @@ -3877,9 +4536,54 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif + fsparam_string("casefold", Opt_casefold_version), + fsparam_flag ("casefold", Opt_casefold), + fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; +#if IS_ENABLED(CONFIG_UNICODE) +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + struct shmem_options *ctx = fc->fs_private; + int version = UTF8_LATEST; + struct unicode_map *encoding; + char *version_str = param->string + 5; + + if (!latest_version) { + if (strncmp(param->string, "utf8-", 5)) + return invalfc(fc, "Only UTF-8 encodings are supported " + "in the format: utf8-<version number>"); + + version = utf8_parse_version(version_str); + if (version < 0) + return invalfc(fc, "Invalid UTF-8 version: %s", version_str); + } + + encoding = utf8_load(version); + + if (IS_ERR(encoding)) { + return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), + unicode_rev(version)); + } + + pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), unicode_rev(version)); + + ctx->encoding = encoding; + + return 0; +} +#else +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +} +#endif + static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; @@ -3924,9 +4628,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = 
result.uint_32 & 07777; break; case Opt_uid: - kuid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(kuid)) - goto bad_value; + kuid = result.uid; /* * The requested uid must be representable in the @@ -3938,9 +4640,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->uid = kuid; break; case Opt_gid: - kgid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(kgid)) - goto bad_value; + kgid = result.gid; /* * The requested gid must be representable in the @@ -3986,7 +4686,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; - ctx->seen |= SHMEM_SEEN_NOSWAP; break; case Opt_quota: if (fc->user_ns != &init_user_ns) @@ -4042,6 +4741,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; + case Opt_casefold_version: + return shmem_parse_opt_casefold(fc, param, false); + case Opt_casefold: + return shmem_parse_opt_casefold(fc, param, true); + case Opt_strict_encoding: +#if IS_ENABLED(CONFIG_UNICODE) + ctx->strict_encoding = true; + break; +#else + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +#endif } return 0; @@ -4051,48 +4761,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4136,14 +4835,15 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Current inum too high to switch to 32-bit inums"; goto out; } - if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) { + + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. 
+ */ + if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } - if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) { - err = "Cannot enable swap on remount if it was disabled on first mount"; - goto out; - } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { @@ -4244,6 +4944,24 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } @@ -4253,6 +4971,11 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) + utf8_unload(sb->s_encoding); +#endif + #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif @@ -4263,6 +4986,13 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) +static const struct dentry_operations shmem_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; @@ -4296,10 +5026,27 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC | SB_I_VERSION; + sb->s_flags |= SB_NOSEC; + +#if IS_ENABLED(CONFIG_UNICODE) + if (!ctx->encoding && ctx->strict_encoding) { + pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); + error = -EINVAL; + goto failed; + } + + if (ctx->encoding) { + sb->s_encoding = ctx->encoding; + set_default_d_op(sb, &shmem_ci_dentry_ops); + if (ctx->strict_encoding) + sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; + } +#endif + #else sb->s_flags |= SB_NOUSER; -#endif +#endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; @@ -4312,7 +5059,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4334,7 +5086,9 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { @@ -4351,8 +5105,8 @@ 
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } #endif /* CONFIG_TMPFS_QUOTA */ - inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, - VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -4388,13 +5142,13 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; -static struct kmem_cache *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -4426,27 +5180,26 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static void shmem_init_inodecache(void) +static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } -static void shmem_destroy_inodecache(void) +static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ -static int shmem_error_remove_page(struct address_space *mapping, - struct page *page) +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } -const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, +static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -4455,12 +5208,11 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif - .error_remove_page = shmem_error_remove_page, + .error_remove_folio = shmem_error_remove_folio, }; -EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { - .mmap = shmem_mmap, + .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS @@ -4534,7 +5286,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, @@ -4572,8 +5324,15 @@ int shmem_init_fs_context(struct fs_context *fc) ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); +#if IS_ENABLED(CONFIG_UNICODE) + ctx->encoding = NULL; +#endif + fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif return 0; } @@ -4584,14 +5343,70 @@ static struct file_system_type shmem_fs_type = { #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif - .kill_sb = kill_litter_super, -#ifdef CONFIG_SHMEM + .kill_sb = kill_anon_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, -#else - .fs_flags = FS_USERNS_MOUNT, +}; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define TMPFS_ATTR_W(_name, _store) \ + static struct 
kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define TMPFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define TMPFS_ATTR_RO(_name, _show) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#if IS_ENABLED(CONFIG_UNICODE) +static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, + char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} +TMPFS_ATTR_RO(casefold, casefold_show); +#endif + +static struct attribute *tmpfs_attributes[] = { +#if IS_ENABLED(CONFIG_UNICODE) + &tmpfs_attr_casefold.attr, #endif + NULL +}; + +static const struct attribute_group tmpfs_attribute_group = { + .attrs = tmpfs_attributes, + .name = "features" }; +static struct kobject *tmpfs_kobj; + +static int __init tmpfs_sysfs_init(void) +{ + int ret; + + tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); + if (!tmpfs_kobj) + return -ENOMEM; + + ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); + if (ret) + kobject_put(tmpfs_kobj); + + return ret; +} +#endif /* CONFIG_SYSFS && CONFIG_TMPFS */ + void __init shmem_init(void) { int error; @@ -4599,11 +5414,7 @@ void __init shmem_init(void) shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA - error = register_quota_format(&shmem_quota_format); - if (error < 0) { - pr_err("Could not register quota format\n"); - goto out3; - } + register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); @@ -4619,11 +5430,26 @@ void __init shmem_init(void) goto out1; } +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + error = tmpfs_sysfs_init(); + if (error) { + pr_err("Could not init tmpfs sysfs\n"); + goto out1; + } +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. + */ + if (!shmem_orders_configured) + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -4632,7 +5458,6 @@ out1: out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); -out3: #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); @@ -4655,11 +5480,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, - shmem_huge == values[i] ? "%s[%s]" : "%s%s", - i ? " " : "", - shmem_format_huge(values[i])); + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? " " : "", shmem_format_huge(values[i])); } - len += sysfs_emit_at(buf, len, "\n"); return len; @@ -4669,7 +5492,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; - int huge; + int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; @@ -4680,20 +5503,238 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge = shmem_parse_huge(tmp); if (huge == -EINVAL) - return -EINVAL; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) - return -EINVAL; + return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; - return count; + + err = start_stop_khugepaged(); + return err ? 
err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "advise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + if (ret > 0) { + int err = start_stop_khugepaged(); + + if (err) + ret = err; + } + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) + +static int __init setup_transparent_hugepage_shmem(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge == -EINVAL) { + pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); + return huge; + } + + shmem_huge = huge; + return 1; +} +__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); + +static int __init 
setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_shmem(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise, within_size; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strscpy(str_dup, str); + + always = huge_shmem_orders_always; + inherit = huge_shmem_orders_inherit; + madvise = huge_shmem_orders_madvise; + within_size = huge_shmem_orders_within_size; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "advise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "within_size")) { + bitmap_set(&within_size, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else { + pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); + goto err; + } + } + } + + huge_shmem_orders_always = always; + huge_shmem_orders_madvise = madvise; + huge_shmem_orders_inherit = inherit; + huge_shmem_orders_within_size = within_size; + shmem_orders_configured = true; + return 1; + +err: + pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_shmem=", setup_thp_shmem); + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #else /* !CONFIG_SHMEM */ /* @@ -4740,11 +5781,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode 
*inode, loff_t lstart, loff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } @@ -4756,8 +5797,9 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } @@ -4767,9 +5809,11 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct supe /* common code */ -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long vm_flags, + unsigned int i_flags) { + unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; @@ -4779,15 +5823,14 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); - inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, - S_IFREG | S_IRWXUGO, 0, flags); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, + S_IFREG | S_IRWXUGO, 0, vm_flags); if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); @@ -4810,7 +5853,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4818,10 +5861,11 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4834,7 +5878,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4845,14 +5889,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -4860,7 +5899,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -4873,6 +5923,25 @@ int shmem_zero_setup(struct vm_area_struct *vma) } /** + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + +/** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index @@ -4895,9 +5964,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - BUG_ON(!shmem_mapping(mapping)); - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, - gfp, NULL, NULL, NULL); + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), + &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); |
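The kernel-doc hunks above describe shmem_kernel_file_setup()/shmem_file_setup() as the interface behind big_key, SysV shm and the /dev/zero mappings set up by shmem_zero_setup(). As a rough sketch (not part of this patch; the function name and 1 MiB size are arbitrary), an in-kernel caller of the exported shmem_file_setup() would look roughly like this:

    #include <linux/err.h>
    #include <linux/mm.h>
    #include <linux/shmem_fs.h>
    #include <linux/sizes.h>

    static struct file *example_tmpfs_file(void)
    {
    	/* VM_NORESERVE: skip pre-accounting of the whole object size */
    	struct file *filp = shmem_file_setup("example", SZ_1M, VM_NORESERVE);

    	if (IS_ERR(filp))
    		return filp;	/* e.g. -EINVAL or -ENOMEM from the checks above */

    	/*
    	 * The file is unlinked and lives purely in tmpfs; its pages can be
    	 * read via filp->f_mapping (see the shmem_read_folio_gfp() hunk
    	 * above) and the object is released on the final fput().
    	 */
    	return filp;
    }

Callers that must bypass LSM file checks, as the comment notes, use shmem_kernel_file_setup() instead, which passes S_PRIVATE so that checks happen at the key/shm layer rather than on the inode.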

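The boot-time parsers added above (setup_thp_shmem() and setup_transparent_hugepage_shmem()) accept entries separated by ';', each of the form <size>[,<size>|<start>-<end>]...:<policy>, with the policy being one of always, inherit, within_size, advise or never. As an illustrative example only (assuming 4 KiB base pages and that these orders are allowed by THP_ORDERS_ALL_FILE_DEFAULT on the platform), a command line might contain:

    thp_shmem=64K-256K:within_size;2M:inherit;16K,32K:never
    transparent_hugepage_shmem=within_size

The same per-order policies are also exposed at run time through the thpsize_shmem_enabled_attr knob added above; with the usual per-size THP sysfs layout (path assumed, not shown in this diff) that would be, for example:

    echo advise > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled

Note that, per the store handler, "inherit" is rejected for non-PMD orders while the global shmem policy is set to "force".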