Diffstat (limited to 'mm/shmem.c')
-rw-r--r--   mm/shmem.c | 3731
1 file changed, 2690 insertions(+), 1041 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 0005ab2c29af..3f194c9842a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -33,15 +33,17 @@ #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> +#include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> +#include <linux/unicode.h> #include "swap.h" -static struct vfsmount *shm_mnt; +static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* @@ -58,14 +60,13 @@ static struct vfsmount *shm_mnt; #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> -#include <linux/shmem_fs.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> @@ -76,25 +77,28 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h> +#include <linux/quotaops.h> +#include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_SIZE/512) #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Pretend that one inode + its dentry occupy this much memory */ +#define BOGO_INODE_SIZE 1024 + /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. 
*/ @@ -103,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -116,12 +120,28 @@ struct shmem_options { bool full_inums; int huge; int seen; + bool noswap; + unsigned short quota_types; + struct shmem_quota_limits qlimits; +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *encoding; + bool strict_encoding; +#endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 +#define SHMEM_SEEN_QUOTA 16 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +static bool shmem_orders_configured __initdata; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -132,14 +152,14 @@ static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); - return min(nr_pages - totalhigh_pages(), nr_pages / 2); + return min3(nr_pages - totalhigh_pages(), nr_pages / 2, + ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - vm_fault_t *fault_type); + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -154,20 +174,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_NORESERVE) ? + return (flags & SHMEM_F_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { - if (!(flags & VM_NORESERVE)) { + if (!(flags & SHMEM_F_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); @@ -180,12 +200,12 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
*/ -static inline int shmem_acct_block(unsigned long flags, long pages) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, @@ -194,44 +214,58 @@ static inline int shmem_acct_block(unsigned long flags, long pages) static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (flags & VM_NORESERVE) + if (flags & SHMEM_F_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int err = -ENOSPC; - if (shmem_acct_block(info->flags, pages)) - return false; + if (shmem_acct_blocks(info->flags, pages)) + return err; + might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) + goto unacct; + + err = dquot_alloc_block_nodirty(inode, pages); + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); + goto unacct; + } + } else { + err = dquot_alloc_block_nodirty(inode, pages); + if (err) goto unacct; - percpu_counter_add(&sbinfo->used_blocks, pages); } - return true; + return 0; unacct: shmem_unacct_blocks(info->flags, pages); - return false; + return err; } -static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + might_sleep(); /* when quotas */ + dquot_free_block_nodirty(inode, pages); + if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; -const struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -240,18 +274,65 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool shmem_mapping(const struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); + +#ifdef CONFIG_TMPFS_QUOTA + +static int shmem_enable_quotas(struct super_block *sb, + unsigned short quota_types) +{ + int type, err = 0; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < SHMEM_MAXQUOTAS; type++) { + if (!(quota_types & (1 << type))) + continue; + err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, + DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (err) + goto out_err; + } 
+ return 0; + +out_err: + pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; +} + +static void shmem_disable_quotas(struct super_block *sb) +{ + int type; + + for (type = 0; type < SHMEM_MAXQUOTAS; type++) + dquot_quota_off(sb, type); +} + +static struct dquot __rcu **shmem_get_dquots(struct inode *inode) +{ + return SHMEM_I(inode)->i_dquot; +} +#endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and @@ -271,11 +352,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { - if (!sbinfo->free_inodes) { + if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } - sbinfo->free_inodes--; + sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; @@ -329,12 +410,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) return 0; } -static void shmem_free_inode(struct super_block *sb) +static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; + sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } @@ -342,6 +423,8 @@ static void shmem_free_inode(struct super_block *sb) /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc + * @alloced: the change in number of pages allocated to inode + * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. @@ -349,55 +432,63 @@ static void shmem_free_inode(struct super_block *sb) * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * - * It has to be called with the spinlock held. + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode) +bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; - freed = info->alloced - info->swapped - inode->i_mapping->nrpages; - if (freed > 0) { + spin_lock(&info->lock); + info->alloced += alloced; + info->swapped += swapped; + freed = info->alloced - info->swapped - + READ_ONCE(inode->i_mapping->nrpages); + /* + * Special case: whereas normally shmem_recalc_inode() is called + * after i_mapping->nrpages has already been adjusted (up or down), + * shmem_writeout() has to raise swapped before nrpages is lowered - + * to stop a racing shmem_recalc_inode() from thinking that a page has + * been freed. Compensate here, to avoid the need for a followup call. 
+ */ + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; + freed += swapped; + } + if (freed > 0) info->alloced -= freed; - inode->i_blocks -= freed * BLOCKS_PER_PAGE; + spin_unlock(&info->lock); + + /* The quota case may block */ + if (freed > 0) shmem_inode_unacct_blocks(inode, freed); - } + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; + struct address_space *mapping = inode->i_mapping; - if (!shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ - inode->i_mapping->nrpages += pages; - - spin_lock_irqsave(&info->lock, flags); - info->alloced += pages; - inode->i_blocks += pages * BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); + xa_lock_irq(&mapping->i_pages); + mapping->nrpages += pages; + xa_unlock_irq(&mapping->i_pages); + shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; - + /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; - inode->i_blocks -= pages * BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - - shmem_inode_unacct_blocks(inode, pages); + shmem_recalc_inode(inode, 0, 0); } /* @@ -420,15 +511,27 @@ static int shmem_replace_entry(struct address_space *mapping, /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. 
*/ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + return ret; } /* @@ -440,9 +543,9 @@ static bool shmem_confirm_swap(struct address_space *mapping, * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, - * also respect fadvise()/madvise() hints; + * also respect madvise() hints; * SHMEM_HUGE_ADVISE: - * only allocate huge pages if requested with fadvise()/madvise(); + * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 @@ -466,59 +569,137 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; + +#undef SHMEM_HUGE_DEFAULT + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; + +#undef TMPFS_HUGE_DEFAULT -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) { + pgoff_t aligned_index; + unsigned long order; loff_t i_size; + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + if (!S_ISREG(inode->i_mode)) - return false; - if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs with 'huge=always' or 'huge=within_size' mount option, + * we will always try PMD-sized order first. If that failed, it will + * fall back to small large folios. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + within_size_orders = shmem_get_orders_within_size(inode, + THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); + if (within_size_orders > 0) + return within_size_orders; + fallthrough; case SHMEM_HUGE_ADVISE: - if (vma && (vma->vm_flags & VM_HUGEPAGE)) - return true; + if (vm_flags & VM_HUGEPAGE) + return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: - return false; + return 0; } } -#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { + int huge; + + if (!str) + return -EINVAL; + if (!strcmp(str, "never")) - return SHMEM_HUGE_NEVER; - if (!strcmp(str, "always")) - return SHMEM_HUGE_ALWAYS; - if (!strcmp(str, "within_size")) - return SHMEM_HUGE_WITHIN_SIZE; - if (!strcmp(str, "advise")) - return SHMEM_HUGE_ADVISE; - if (!strcmp(str, "deny")) - return SHMEM_HUGE_DENY; - if (!strcmp(str, "force")) - return SHMEM_HUGE_FORCE; - return -EINVAL; + huge = SHMEM_HUGE_NEVER; + else if (!strcmp(str, "always")) + huge = SHMEM_HUGE_ALWAYS; + else if (!strcmp(str, "within_size")) + huge = SHMEM_HUGE_WITHIN_SIZE; + else if (!strcmp(str, "advise")) + huge = SHMEM_HUGE_ADVISE; + else if (!strcmp(str, "deny")) + huge = SHMEM_HUGE_DENY; + else if (!strcmp(str, "force")) + huge = SHMEM_HUGE_FORCE; + else + return -EINVAL; + + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + + return huge; } -#endif #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) @@ -544,15 +725,14 @@ static const char *shmem_format_huge(int huge) #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; - LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int split = 0; + unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -570,13 +750,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto next; } - /* Check if there's anything to gain */ - if (round_up(inode->i_size, PAGE_SIZE) == - round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_move(&info->shrinklist, &to_remove); - goto next; - } - list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; @@ -585,34 +758,36 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); - list_for_each_safe(pos, next, &to_remove) { - info = list_entry(pos, struct shmem_inode_info, shrinklist); - inode = &info->vfs_inode; - list_del_init(&info->shrinklist); - iput(inode); - } - list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; int ret; - pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) + if (nr_to_free && freed >= nr_to_free) goto move_back; - index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; - folio = filemap_get_folio(inode->i_mapping, index); - if (!folio) + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) goto drop; - /* No huge page at the end of the file: nothing to split */ + /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. @@ -633,6 +808,7 @@ next: if (ret) goto move_back; + freed += next - end; split++; drop: list_del_init(&info->shrinklist); @@ -676,90 +852,93 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_free) { - return false; + return 0; } -static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* - * Like filemap_add_folio, but error if expected item has gone. + * Somewhat like filemap_add_folio, but error if expected item has gone. 
*/ -static int shmem_add_to_page_cache(struct folio *folio, - struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp, - struct mm_struct *charge_mm) +int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); - long nr = folio_nr_pages(folio); - int error; + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; - if (!folio_test_swapcache(folio)) { - error = mem_cgroup_charge(folio, charge_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto error; - } - } + gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); do { + iter = swap; xas_lock_irq(&xas); - if (expected != xas_find_conflict(&xas)) { - xas_set_err(&xas, -EEXIST); - goto unlock; + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); } - if (expected && xas_find_conflict(&xas)) { + if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_ALLOC); - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - } + shmem_update_stats(folio, nr); mapping->nrpages += nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - error = xas_error(&xas); - goto error; + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); } return 0; -error: - folio->mapping = NULL; - folio_ref_sub(folio, nr); - return error; } /* - * Like delete_from_page_cache, but substitutes swap for @folio. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { @@ -771,26 +950,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). 
*/ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -804,22 +986,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; unsigned long swapped = 0; + unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, end - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) - swapped++; - + if (xa_is_value(folio)) + swapped += 1 << xas_get_order(&xas); + if (xas.xa_index == max) + break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } - rcu_read_unlock(); return swapped << PAGE_SHIFT; @@ -884,18 +1067,25 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails - * beyond i_size, and reports fallocated pages as holes. + * beyond i_size, and reports fallocated folios as holes. */ - folio = __filemap_get_folio(inode->i_mapping, index, - FGP_ENTRY | FGP_LOCK, 0); - if (!xa_is_value(folio)) + folio = filemap_get_entry(inode->i_mapping, index); + if (!folio) return folio; + if (!xa_is_value(folio)) { + folio_lock(folio); + if (folio->mapping == inode->i_mapping) + return folio; + /* The folio has been swapped out */ + folio_unlock(folio); + folio_put(folio); + } /* - * But read a page back from swap if any of it is within i_size + * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; - shmem_get_folio(inode, index, &folio, SGP_READ); + shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } @@ -903,7 +1093,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
*/ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; @@ -934,7 +1124,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -960,10 +1150,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { - same_folio = lend < folio_pos(folio) + folio_size(folio); + same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { - start = folio->index + folio_nr_pages(folio); + start = folio_next_index(folio); if (same_folio) end = folio->index; } @@ -1001,14 +1191,17 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1023,7 +1216,24 @@ whole_folios: } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - truncate_inode_folio(mapping, folio); + + if (!folio_test_large(folio)) { + truncate_inode_folio(mapping, folio); + } else if (truncate_inode_partial_folio(folio, lstart, lend)) { + /* + * If we split a page, reset the loop so + * that we pick up the new sub pages. + * Otherwise the THP was entirely + * dropped or the target range was + * zeroed, so just continue the loop as + * is. 
+ */ + if (!folio_test_large(folio)) { + folio_unlock(folio); + index = start; + break; + } + } } folio_unlock(folio); } @@ -1031,32 +1241,27 @@ whole_folios: folio_batch_release(&fbatch); } - spin_lock_irq(&info->lock); - info->swapped -= nr_swaps_freed; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(struct user_namespace *mnt_userns, +static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - if (info->alloced - info->swapped != inode->i_mapping->nrpages) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - } + if (info->alloced - info->swapped != inode->i_mapping->nrpages) + shmem_recalc_inode(inode, 0, 0); + if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) @@ -1066,9 +1271,9 @@ static int shmem_getattr(struct user_namespace *mnt_userns, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(NULL, inode, 0, false)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1080,7 +1285,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, return 0; } -static int shmem_setattr(struct user_namespace *mnt_userns, +static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -1089,10 +1294,16 @@ static int shmem_setattr(struct user_namespace *mnt_userns, bool update_mtime = false; bool update_ctime = true; - error = setattr_prepare(&init_user_ns, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; + if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { + if ((inode->i_mode ^ attr->ia_mode) & 0111) { + return -EPERM; + } + } + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1103,6 +1314,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, return -EPERM; if (newsize != oldsize) { + if (info->flags & SHMEM_F_MAPPING_FROZEN) + return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) @@ -1127,13 +1340,27 @@ static int shmem_setattr(struct user_namespace *mnt_userns, } } - setattr_copy(&init_user_ns, inode, attr); + if (is_quota_modification(idmap, inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } + + /* Transfer quota accounting */ + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + error = dquot_transfer(idmap, inode, attr); + if (error) + return error; + } + + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, dentry, 
inode->i_mode); + error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (update_mtime) - inode->i_mtime = inode->i_ctime; + inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; @@ -1143,6 +1370,7 @@ static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); @@ -1161,23 +1389,27 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } - simple_xattrs_free(&info->xattrs); + simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); - shmem_free_inode(inode->i_sb); clear_inode(inode); +#ifdef CONFIG_TMPFS_QUOTA + dquot_free_inode(inode); + dquot_drop(inode); +#endif } -static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, struct folio_batch *fbatch, - pgoff_t *indices, unsigned int type) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; @@ -1210,7 +1442,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, } rcu_read_unlock(); - return xas.xa_index; + return folio_batch_count(fbatch); } /* @@ -1228,12 +1460,8 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; - error = shmem_swapin_folio(inode, indices[i], - &folio, SGP_CACHE, - mapping_gfp_mask(mapping), - NULL, NULL); + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); @@ -1259,8 +1487,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) do { folio_batch_init(&fbatch); - shmem_find_swap_entries(mapping, start, &fbatch, indices, type); - if (folio_batch_count(&fbatch) == 0) { + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { ret = 0; break; } @@ -1288,7 +1516,8 @@ int shmem_unuse(unsigned int type) if (list_empty(&shmem_swaplist)) return 0; - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1301,72 +1530,79 @@ int shmem_unuse(unsigned int type) * (igrab() would protect from unlink, but not from unmount). 
*/ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @plug: swap plug + * @folio_list: list to put back folios on split + * + * Move the folio from the page cache to the swap cache. */ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { - struct folio *folio = page_folio(page); - struct shmem_inode_info *info; - struct address_space *mapping; - struct inode *inode; - swp_entry_t swap; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; + int nr_pages; + bool split = false; + + if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) + goto redirty; + + if (!total_swap_pages) + goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_folio_to_list(folio, folio_list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } - BUG_ON(!folio_test_locked(folio)); - mapping = folio->mapping; index = folio->index; - inode = mapping->host; - info = SHMEM_I(inode); - if (info->flags & VM_LOCKED) - goto redirty; - if (!total_swap_pages) - goto redirty; - - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ - if (!wbc->for_reclaim) { - WARN_ON_ONCE(1); /* Still happens? Tell us about it! 
*/ - goto redirty; - } + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1388,7 +1624,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); @@ -1400,48 +1636,66 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) folio_mark_uptodate(folio); } - swap = folio_alloc_swap(folio); - if (!swap.val) - goto redirty; + if (!folio_alloc_swap(folio)) { + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + + swap_shmem_alloc(folio->swap, nr_pages); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); - - if (add_to_swap_cache(folio, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, - NULL) == 0) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - info->swapped++; - spin_unlock_irq(&info->lock); - - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); - - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - swap_writepage(&folio->page, wbc); - return 0; - } + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } - mutex_unlock(&shmem_swaplist_mutex); - put_swap_folio(folio, swap); + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } + + /* + * The swap_cache_del_folio() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. 
+ */ + swap_cache_del_folio(folio); + goto redirty; + } + if (nr_pages > 1) + goto try_split; redirty: folio_mark_dirty(folio); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ - folio_unlock(folio); - return 0; + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1476,42 +1730,22 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif - -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; - shmem_pseudo_vma_init(&pvma, info, index); - page = swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + folio = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); - if (!page) - return NULL; - return page_folio(page); + return folio; } /* @@ -1538,69 +1772,316 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct folio *shmem_alloc_hugefolio(gfp_t gfp, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool shmem_hpage_pmd_enabled(void) +{ + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && + shmem_huge != SHMEM_HUGE_NEVER) + return true; + + return false; +} + +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + loff_t write_end, bool shmem_huge_force) +{ + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; + unsigned int global_orders; + + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) + return 0; + + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; + + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. 
+ */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); + + if (global_orders > 0) + mask |= READ_ONCE(huge_shmem_orders_inherit); + + return THP_ORDERS_ALL_FILE_DEFAULT & mask; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + pgoff_t aligned_index; + unsigned long pages; + int order; + + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } + + return orders; +} +#else +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct address_space *mapping = info->vfs_inode.i_mapping; - pgoff_t hindex; + struct mempolicy *mpol; + pgoff_t ilx; struct folio *folio; - hindex = round_down(index, HPAGE_PMD_NR); - if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, - XA_PRESENT)) - return NULL; + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - shmem_pseudo_vma_init(&pvma, info, hindex); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); return folio; } -static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) { - struct vm_area_struct pvma; - struct folio *folio; + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long suitable_orders = 0; + struct folio *folio = NULL; + pgoff_t aligned_index; + long pages; + int error, order; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + orders = 0; + + if (orders > 0) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + + order = highest_order(suitable_orders); + while (suitable_orders) { + 
pages = 1UL << order; + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; + goto allocated; + } + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); + order = next_order(&suitable_orders, order); + } + } else { + pages = 1; + folio = shmem_alloc_folio(gfp, 0, info, index); + } + if (!folio) + return ERR_PTR(-ENOMEM); + +allocated: + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); + } + goto unlock; + } - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. + */ + shmem_unused_huge_shrink(sbinfo, NULL, pages); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. + */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *folio; - int nr; - int err = -ENOSPC; + int nr_pages = 1 << order; + struct folio *new; + gfp_t alloc_gfp; + void *shadow; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - huge = false; - nr = huge ? HPAGE_PMD_NR : 1; + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + alloc_gfp = gfp; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. 
+ */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + goto fallback; - if (!shmem_inode_acct_block(inode, nr)) - goto failed; + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; + } - if (huge) - folio = shmem_alloc_hugefolio(gfp, info, index); - else - folio = shmem_alloc_folio(gfp, info, index); - if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - return folio; + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + alloc_gfp, entry)) { + folio_put(new); + new = ERR_PTR(-ENOMEM); + goto fallback; } - err = -ENOMEM; - shmem_inode_unacct_blocks(inode, nr); -failed: - return ERR_PTR(err); + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + memcg1_swapin(entry, nr_pages); + shadow = swap_cache_get_shadow(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } /* @@ -1621,102 +2102,164 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = folio_swap_entry(old); - swap_index = swp_offset(entry); - swap_mapping = swap_address_space(entry); + struct swap_cluster_info *ci; + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + int nr_pages = folio_nr_pages(old); + int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. 
*/ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); - folio_set_swap_entry(new, entry); + new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. - */ - xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); - if (!error) { - mem_cgroup_migrate(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); - } - xa_unlock_irq(&swap_mapping->i_pages); - - if (unlikely(error)) { - /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. - */ - old = new; - } else { - folio_add_lru(new); - *foliop = new; - } + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); + swap_cluster_unlock_irq(ci); + + folio_add_lru(new); + *foliop = new; folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swapin_error; void *old; + int nr_pages; - swapin_error = make_swapin_error_entry(); + swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); - spin_lock_irq(&info->lock); + if (!skip_swapcache) + swap_cache_del_folio(folio); /* - * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't - * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in - * shmem_evict_inode. + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks + * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) + * in shmem_evict_inode(). 
*/ - info->alloced--; - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + int split_order = 0; + int i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + void *old = NULL; + int cur_order; + pgoff_t swap_index; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + cur_order = xas_get_order(&xas); + if (!cur_order) + goto unlock; + + /* Try to split large swap entry in pagecache */ + swap_index = round_down(index, 1 << cur_order); + split_order = xas_try_split_min_order(cur_order); + + while (cur_order > 0) { + pgoff_t aligned_index = + round_down(index, 1 << cur_order); + pgoff_t swap_offset = aligned_index - swap_index; + + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, cur_order); + if (xas_error(&xas)) + goto unlock; + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << cur_order; + i += (1 << split_order)) { + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), + swp_offset(swap) + swap_offset + + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + cur_order = split_order; + split_order = xas_try_split_min_order(split_order); + } + +unlock: + xas_unlock_irq(&xas); + + if (!xas_nomem(&xas, gfp)) + break; + } + + if (xas_error(&xas)) + return xas_error(&xas); + + return 0; } /* @@ -1731,41 +2274,114 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct folio *folio = NULL; swp_entry_t swap; - int error; + softleaf_t index_entry; + struct swap_info_struct *si; + struct folio *folio = NULL; + bool skip_swapcache = false; + int error, nr_pages, order; + pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); - swap = radix_to_swp_entry(*foliop); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; *foliop = NULL; - if (is_swapin_error_entry(swap)) + if (softleaf_is_poison_marker(index_entry)) return -EIO; + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); + if (unlikely(!si)) { + if (order < 0) + return -EEXIST; + else + return -EINVAL; + } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } + + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Look it up and read it in.. */ - folio = swap_cache_get_folio(swap, NULL, 0); + folio = swap_cache_get_folio(swap); if (!folio) { - /* Or update major stats only when swapin succeeds?? 
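shmem_split_large_entry() above re-stores the sub-entries of a split large swap entry so that their swap offsets stay contiguous with the original. A small arithmetic sketch of that index-to-offset mapping for a single split step; the index, orders and base_offset value are made up for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long index = 21;          /* faulting page index */
        unsigned long base_offset = 4096;  /* swap offset of the large entry */
        unsigned int cur_order = 4;        /* order-16 entry being split */
        unsigned int split_order = 2;      /* target order after one split */

        unsigned long swap_index = index & ~((1UL << cur_order) - 1);
        unsigned long i;

        for (i = 0; i < (1UL << cur_order); i += (1UL << split_order))
                printf("pages %lu..%lu keep swap offset %lu\n",
                       swap_index + i,
                       swap_index + i + (1UL << split_order) - 1,
                       base_offset + i);
        return 0;
}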
*/ + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* Cached swapin only supports order 0 folio */ + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); - } - /* Here we actually start the io */ - folio = shmem_swapin(swap, gfp, info, index); - if (!folio) { - error = -ENOMEM; - goto failed; + count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } else { + swap_update_readahead(folio, NULL, 0); + } + + if (order > folio_order(folio)) { + /* + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) + goto failed_nolock; } - /* We have to do this with folio locked to prevent races */ + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); + } + + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. + */ folio_lock(folio); - if (!folio_test_swapcache(folio) || - folio_swap_entry(folio).val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + if ((!skip_swapcache && !folio_test_swapcache(folio)) || + shmem_confirm_swap(mapping, index, swap) < 0 || + folio->swap.val != swap.val) { error = -EEXIST; goto unlock; } @@ -1779,44 +2395,51 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * Some architectures may have to restore extra metadata to the * folio after reading from swap. 
*/ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, - swp_to_radix_entry(swap), gfp, - charge_mm); + swp_to_radix_entry(swap), gfp); if (error) goto failed; - spin_lock_irq(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + swap_cache_del_folio(folio); + } folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); + put_swap_device(si); *foliop = folio; return 0; failed: - if (!shmem_confirm_swap(mapping, index, swap)) + if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: - if (folio) { + if (folio) folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) folio_put(folio); - } + put_swap_device(si); return error; } @@ -1828,49 +2451,43 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vma, vmf, and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo; - struct mm_struct *charge_mm; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm; struct folio *folio; - pgoff_t hindex; - gfp_t huge_gfp; int error; - int once = 0; - int alloced = 0; + bool alloced; + unsigned long orders = 0; + + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; - } - sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : NULL; + alloced = false; + fault_mm = vma ? 
vma->vm_mm : NULL; - folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0); + folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { - if (!xa_is_value(folio)) { - folio_unlock(folio); + if (!xa_is_value(folio)) folio_put(folio); - } *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); return 0; } if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; @@ -1879,6 +2496,14 @@ repeat: } if (folio) { + folio_lock(folio); + + /* Has the folio been truncated or swapped out? */ + if (unlikely(folio->mapping != inode->i_mapping)) { + folio_unlock(folio); + folio_put(folio); + goto repeat; + } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) @@ -1909,61 +2534,41 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index, false)) - goto alloc_nohuge; - - huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); - if (IS_ERR(folio)) { -alloc_nohuge: - folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); + if (orders > 0) { + gfp_t huge_gfp; + + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); + if (!IS_ERR(folio)) { + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); + goto alloced; + } + if (PTR_ERR(folio) == -EEXIST) + goto repeat; } - if (IS_ERR(folio)) { - int retry = 5; + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); + if (IS_ERR(folio)) { error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; folio = NULL; - if (error != -ENOSPC) - goto unlock; - /* - * Try to reclaim some space by splitting a large folio - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } goto unlock; } - hindex = round_down(index, folio_nr_pages(folio)); - - if (sgp == SGP_WRITE) - __folio_set_referenced(folio); - - error = shmem_add_to_page_cache(folio, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK, - charge_mm); - if (error) - goto unacct; - folio_add_lru(folio); - - spin_lock_irq(&info->lock); - info->alloced += folio_nr_pages(folio); - inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); +alloced: alloced = true; - - if (folio_test_pmd_mappable(folio) && + if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - folio_next_index(folio) - 1) { + folio_next_index(folio)) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. @@ -1981,6 +2586,8 @@ alloc_nohuge: spin_unlock(&sbinfo->shrinklist_lock); } + if (sgp == SGP_WRITE) + folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 
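The allocation path above puts any large folio whose tail crosses i_size on the per-superblock shrink list. The end-of-file test it uses is plain page arithmetic; a standalone sketch with made-up numbers (DIV_ROUND_UP redefined locally for the demo):

#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long long i_size = 70 * 1024;                     /* 70 KiB file */
        unsigned long long folio_index = 16, folio_nr_pages = 16;  /* pages 16..31 */
        unsigned long long eof_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); /* 18 */

        if (eof_pages < folio_index + folio_nr_pages)
                printf("pages %llu..%llu of this folio lie beyond EOF\n",
                       eof_pages, folio_index + folio_nr_pages - 1);
        return 0;
}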
*/ @@ -2004,13 +2611,6 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - if (alloced) { - folio_clear_dirty(folio); - filemap_remove_folio(folio); - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - } error = -EINVAL; goto unlock; } @@ -2021,123 +2621,147 @@ out: /* * Error recovery. */ -unacct: - shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - - if (folio_test_large(folio)) { - folio_unlock(folio); - folio_put(folio); - goto alloc_nohuge; - } unlock: + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } - if (error == -ENOSPC && !once++) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - goto repeat; - } - if (error == -EEXIST) - goto repeat; return error; } -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp) +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @write_end: end of a write, could extend inode size + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. + */ +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) { - return shmem_get_folio_gfp(inode, index, foliop, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } +EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. 
+ * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; + } + return ret; +} + static vm_fault_t shmem_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; + vm_fault_t ret = 0; int err; - vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_rwsem. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_undo_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_rwsem in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * prevent the hole-punch from ever completing: noted in i_private. 
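shmem_falloc_wait() above parks a faulting task on the hole-puncher's wait queue whenever the fault lands inside the range being punched. A rough pthread model of that wait/wake pattern (build with cc -pthread); the mutex, condition variable and punching/punch_start/punch_next variables are stand-ins, not the kernel's i_lock or waitqueue:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t punch_done = PTHREAD_COND_INITIALIZER;
static bool punching;
static unsigned long punch_start, punch_next;

static void *fault_thread(void *arg)
{
        unsigned long pgoff = (unsigned long)arg;

        pthread_mutex_lock(&lock);
        while (punching && pgoff >= punch_start && pgoff < punch_next) {
                printf("fault at %lu waits for the hole-punch\n", pgoff);
                pthread_cond_wait(&punch_done, &lock);
        }
        pthread_mutex_unlock(&lock);
        printf("fault at %lu proceeds\n", pgoff);
        return NULL;
}

int main(void)
{
        pthread_t t;

        punching = true;
        punch_start = 0;
        punch_next = 100;
        pthread_create(&t, NULL, fault_thread, (void *)42UL);

        sleep(1);                       /* "punch" the hole */
        pthread_mutex_lock(&lock);
        punching = false;
        pthread_cond_broadcast(&punch_done);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}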
*/ if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (shmem_falloc && - shmem_falloc->waitq && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - struct file *fpin; - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); - - ret = VM_FAULT_NOPAGE; - fpin = maybe_unlock_mmap_for_io(vmf, NULL); - if (fpin) - ret = VM_FAULT_RETRY; - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the shmem_fallocate() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). - */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - - if (fpin) - fput(fpin); + ret = shmem_falloc_wait(vmf, inode); + if (ret) return ret; - } - spin_unlock(&inode->i_lock); } - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, - gfp, vma, vmf, &ret); + WARN_ON_ONCE(vmf->page != NULL); + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, + gfp, vmf, &ret); if (err) return vmf_error(err); - if (folio) + if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } return ret; } @@ -2145,19 +2769,17 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; + unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2170,8 +2792,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (shmem_huge == SHMEM_HUGE_DENY) return addr; - if (len < HPAGE_PMD_SIZE) - return addr; if (flags & MAP_FIXED) return addr; /* @@ -2183,8 +2803,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (uaddr == addr) return addr; + hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); @@ -2197,33 +2820,53 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif } - if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } - offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); - if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + if (len < hpage_size) return addr; - if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) + return addr; + if ((addr & (hpage_size - 1)) == offset) return addr; - inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; - inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) - inflated_addr += HPAGE_PMD_SIZE; + inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; @@ -2238,15 +2881,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { @@ -2259,15 +2928,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). 
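The inflation logic above asks for len + hpage_size - PAGE_SIZE of address space and then shifts the returned address so the mapping's offset within a huge page matches the file offset. The arithmetic can be checked in isolation; the sketch below uses a made-up returned address and assumes the inflated search landed at that same address:

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096, hpage_size = 2UL << 20;  /* 2 MiB */
        unsigned long pgoff = 3;                 /* file offset, in pages */
        unsigned long len = 8UL << 20;           /* 8 MiB mapping */
        unsigned long addr = 0x7f1234567000UL;   /* what the first search returned */

        unsigned long offset = (pgoff * page_size) & (hpage_size - 1);
        unsigned long inflated_len = len + hpage_size - page_size;
        /* pretend the inflated search also landed at addr */
        unsigned long inflated_addr = addr;
        unsigned long inflated_offset = inflated_addr & (hpage_size - 1);

        inflated_addr += offset - inflated_offset;
        if (inflated_offset > offset)
                inflated_addr += hpage_size;

        printf("ask for %lu bytes, return %#lx (offset in hugepage %#lx, want %#lx)\n",
               inflated_len, inflated_addr,
               inflated_addr & (hpage_size - 1), offset);
        return 0;
}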
*/ - if (lock && !(info->flags & VM_LOCKED)) { + if (lock && !(info->flags & SHMEM_F_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; - info->flags |= VM_LOCKED; + info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && ucounts) { + if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); - info->flags &= ~VM_LOCKED; + info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; @@ -2276,38 +2945,85 @@ out_nomem: return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +static int shmem_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_future_write(info->seals, vma); - if (ret) - return ret; - - /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) - vma->vm_ops = &shmem_vm_ops; + desc->vm_ops = &shmem_vm_ops; else - vma->vm_ops = &shmem_anon_vm_ops; + desc->vm_ops = &shmem_anon_vm_ops; return 0; } +static int shmem_file_open(struct inode *inode, struct file *file) +{ + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(inode, file); +} + #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#if IS_ENABLED(CONFIG_UNICODE) +/* + * shmem_inode_casefold_flags - Deal with casefold file attribute flag + * + * The casefold file attribute needs some special checks. I can just be added to + * an empty dir, and can't be removed from a non-empty dir. + */ +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + unsigned int old = inode->i_flags; + struct super_block *sb = inode->i_sb; + + if (fsflags & FS_CASEFOLD_FL) { + if (!(old & S_CASEFOLD)) { + if (!sb->s_encoding) + return -EOPNOTSUPP; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + *i_flags = *i_flags | S_CASEFOLD; + } else if (old & S_CASEFOLD) { + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + return 0; +} +#else +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + if (fsflags & FS_CASEFOLD_FL) + return -EOPNOTSUPP; + + return 0; +} +#endif + /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; + int ret; + + ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); + if (ret) + return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; @@ -2318,92 +3034,147 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) /* * But FS_NODUMP_FL does not require any action in i_flags. 
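shmem_set_inode_flags() above translates the chattr fsflags that tmpfs honours into S_* inode flags, with FS_NODUMP_FL needing no i_flags bit at all. A toy version of that translation; the flag values below are invented for the demo and do not match the real UAPI constants:

#include <stdio.h>

#define FS_NOATIME_FL   0x1
#define FS_APPEND_FL    0x2
#define FS_IMMUTABLE_FL 0x4

#define S_NOATIME   0x10
#define S_APPEND    0x20
#define S_IMMUTABLE 0x40

static unsigned int fsflags_to_iflags(unsigned int fsflags)
{
        unsigned int i_flags = 0;

        if (fsflags & FS_NOATIME_FL)
                i_flags |= S_NOATIME;
        if (fsflags & FS_APPEND_FL)
                i_flags |= S_APPEND;
        if (fsflags & FS_IMMUTABLE_FL)
                i_flags |= S_IMMUTABLE;
        return i_flags;
}

int main(void)
{
        printf("i_flags = %#x\n", fsflags_to_iflags(FS_NOATIME_FL | FS_APPEND_FL));
        return 0;
}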
*/ - inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); + + return 0; } #else -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif -static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) +{ + return &SHMEM_I(inode)->dir_offsets; +} + +static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; + int err; - if (shmem_reserve_inode(sb, &ino)) - return NULL; + err = shmem_reserve_inode(sb, &ino); + if (err) + return ERR_PTR(err); inode = new_inode(sb); - if (inode) { - inode->i_ino = ino; - inode_init_owner(&init_user_ns, inode, dir, mode); - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_random_u32(); - info = SHMEM_I(inode); - memset(info, 0, (char *)inode - (char *)info); - spin_lock_init(&info->lock); - atomic_set(&info->stop_eviction, 0); - info->seals = F_SEAL_SEAL; - info->flags = flags & VM_NORESERVE; - info->i_crtime = inode->i_mtime; - info->fsflags = (dir == NULL) ? 0 : - SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); - INIT_LIST_HEAD(&info->shrinklist); - INIT_LIST_HEAD(&info->swaplist); - simple_xattrs_init(&info->xattrs); - cache_no_acl(inode); + if (!inode) { + shmem_free_inode(sb, 0); + return ERR_PTR(-ENOSPC); + } + + inode->i_ino = ino; + inode_init_owner(idmap, inode, dir, mode); + inode->i_blocks = 0; + simple_inode_init_ts(inode); + inode->i_generation = get_random_u32(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); + info->seals = F_SEAL_SEAL; + info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + info->i_crtime = inode_get_mtime(inode); + info->fsflags = (dir == NULL) ? 0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags, NULL); + INIT_LIST_HEAD(&info->shrinklist); + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); - switch (mode & S_IFMT) { - default: - inode->i_op = &shmem_special_inode_operations; - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_mapping->a_ops = &shmem_aops; - inode->i_op = &shmem_inode_operations; - inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, - shmem_get_sbmpol(sbinfo)); - break; - case S_IFDIR: - inc_nlink(inode); - /* Some things misbehave if size == 0 on a directory */ - inode->i_size = 2 * BOGO_DIRENT_SIZE; - inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - break; - case S_IFLNK: - /* - * Must not load anything in the rbtree, - * mpol_free_shared_policy will not be called. 
- */ - mpol_shared_policy_init(&info->policy, NULL); - break; - } + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_offset_dir_operations; + simple_offset_init(shmem_get_offset_ctx(inode)); + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + + lockdep_annotate_inode_mutex_key(inode); + return inode; +} + +#ifdef CONFIG_TMPFS_QUOTA +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + int err; + struct inode *inode; + + inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); + if (IS_ERR(inode)) + return inode; + + err = dquot_initialize(inode); + if (err) + goto errout; - lockdep_annotate_inode_mutex_key(inode); - } else - shmem_free_inode(sb); + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); + goto errout; + } return inode; + +errout: + inode->i_flags |= S_NOQUOTA; + iput(inode); + return ERR_PTR(err); } +#else +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); +} +#endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, bool wp_copy, - struct page **pagep) + uffd_flags_t flags, + struct folio **foliop) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2415,26 +3186,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, int ret; pgoff_t max_off; - if (!shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to * avoid a BUG_ON in our caller. */ - if (unlikely(*pagep)) { - put_page(*pagep); - *pagep = NULL; + if (unlikely(*foliop)) { + folio_put(*foliop); + *foliop = NULL; } return -ENOMEM; } - if (!*pagep) { + if (!*foliop) { ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, info, pgoff); + folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; - if (!zeropage) { /* COPY */ + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. 
Despite the @@ -2460,7 +3231,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = &folio->page; + *foliop = folio; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; @@ -2471,9 +3242,9 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, clear_user_highpage(&folio->page, dst_addr); } } else { - folio = page_folio(*pagep); + folio = *foliop; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *pagep = NULL; + *foliop = NULL; } VM_BUG_ON(folio_test_locked(folio)); @@ -2487,22 +3258,19 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; - ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - &folio->page, true, wp_copy); + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, flags); if (ret) goto out_delete_from_cache; - spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - + shmem_recalc_inode(inode, 1, 0); folio_unlock(folio); return 0; out_delete_from_cache: @@ -2521,9 +3289,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct page **pagep, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2540,54 +3308,45 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && + pos + len > inode->i_size)) + return -EPERM; + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; - *pagep = folio_file_page(folio, index); - if (PageHWPoison(*pagep)) { + if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; return -EIO; } + *foliop = folio; return 0; } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - if (!PageUptodate(page)) { - struct page *head = compound_head(page); - if (PageTransCompound(page)) { - int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) { - if (head + i == page) - continue; - clear_highpage(head + i); - flush_dcache_page(head + i); - } - } - if (copied < PAGE_SIZE) { - unsigned from = pos & (PAGE_SIZE - 1); - zero_user_segments(page, 0, from, - from + copied, PAGE_SIZE); + if (!folio_test_uptodate(folio)) { + if (copied < folio_size(folio)) { + size_t from = offset_in_folio(folio, pos); + folio_zero_segments(folio, 0, from, + from + copied, 
folio_size(folio)); } - SetPageUptodate(head); + folio_mark_uptodate(folio); } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -2601,28 +3360,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long offset; int error = 0; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - - index = *ppos >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; for (;;) { struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); + bool fallback_page_copy = false; + size_t fsize; - end_index = i_size >> PAGE_SHIFT; - if (index > end_index) + if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) - break; - } - error = shmem_get_folio(inode, index, &folio, SGP_READ); + index = iocb->ki_pos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -2637,24 +3388,29 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ - nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) { - if (folio) - folio_put(folio); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + if (folio && likely(!fallback_page_copy)) + fsize = folio_size(folio); + else + fsize = PAGE_SIZE; + offset = iocb->ki_pos & (fsize - 1); + nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* @@ -2662,10 +3418,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_copy)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + /* - * Mark the page accessed if we read the beginning. + * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); @@ -2673,9 +3434,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... 
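The rewritten read loop above copies min(end_offset - ki_pos, fsize - offset) bytes per iteration, where fsize is the folio size unless the hwpoison fallback forces PAGE_SIZE. A standalone sketch of that chunking arithmetic with made-up position, request size and folio order:

#include <stdio.h>

int main(void)
{
        unsigned long long pos = 5000, i_size = 100000, want = 64000;
        unsigned long fsize = 16 * 4096;        /* order-4 folio */

        unsigned long long end_offset = i_size < pos + want ? i_size : pos + want;

        while (pos < end_offset) {
                unsigned long offset = pos & (fsize - 1);
                unsigned long long nr = end_offset - pos;

                if (nr > fsize - offset)
                        nr = fsize - offset;
                printf("copy %llu bytes at pos %llu (in-folio offset %lu)\n",
                       nr, pos, offset);
                pos += nr;
        }
        return 0;
}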
*/ - ret = copy_page_to_iter(page, offset, nr, to); + if (likely(!fallback_page_copy)) + ret = copy_folio_to_iter(folio, offset, nr, to); + else + ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); - } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but @@ -2693,9 +3456,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } retval += ret; - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; + iocb->ki_pos += ret; if (!iov_iter_count(to)) break; @@ -2706,11 +3467,189 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? retval : error; } +static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); + if (ret) + goto unlock; + ret = file_update_time(file); + if (ret) + goto unlock; + ret = generic_perform_write(iocb, from); +unlock: + inode_unlock(inode); + return ret; +} + +static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return true; +} + +static void zero_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ +} + +static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return false; +} + +static const struct pipe_buf_operations zero_pipe_buf_ops = { + .release = zero_pipe_buf_release, + .try_steal = zero_pipe_buf_try_steal, + .get = zero_pipe_buf_get, +}; + +static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, + loff_t fpos, size_t size) +{ + size_t offset = fpos & ~PAGE_MASK; + + size = min_t(size_t, size, PAGE_SIZE - offset); + + if (!pipe_is_full(pipe)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); + + *buf = (struct pipe_buffer) { + .ops = &zero_pipe_buf_ops, + .page = ZERO_PAGE(0), + .offset = offset, + .len = size, + }; + pipe->head++; + } + + return size; +} + +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct address_space *mapping = inode->i_mapping; + struct folio *folio = NULL; + size_t total_spliced = 0, used, npages, n, part; + loff_t isize; + int error = 0; + + /* Work out how much data we can actually add into the pipe */ + used = pipe_buf_usage(pipe); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); + + do { + bool fallback_page_splice = false; + struct page *page = NULL; + pgoff_t index; + size_t size; + + if (*ppos >= i_size_read(inode)) + break; + + index = *ppos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) + error = 0; + break; + } + if (folio) { + folio_unlock(folio); + + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + error = -EIO; + break; + } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_splice = true; + } + + /* + * i_size must be checked after we know the pages are Uptodate. 
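splice_zeropage_into_pipe() above feeds holes straight from the shared zero page instead of allocating folios; the same sparseness is visible from userspace, where unwritten ranges of a tmpfs file read back as zeroes. A small demonstration using memfd_create() as a tmpfs-backed file (assumes a Linux system with memfd support):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        int fd = memfd_create("sparse-demo", 0);

        if (fd < 0 || ftruncate(fd, 1 << 20) < 0)
                return 1;
        pwrite(fd, "data", 4, 0);               /* only page 0 is populated */

        ssize_t n = pread(fd, buf, sizeof(buf), 8 * 4096);      /* a hole */
        int all_zero = (n == (ssize_t)sizeof(buf));

        for (ssize_t i = 0; i < n; i++)
                if (buf[i])
                        all_zero = 0;
        printf("read %zd bytes from the hole, all zero: %s\n",
               n, all_zero ? "yes" : "no");
        close(fd);
        return 0;
}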
+ * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + break; + /* + * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned + * pages. + */ + size = len; + if (unlikely(fallback_page_splice)) { + size_t offset = *ppos & ~PAGE_MASK; + + size = umin(size, PAGE_SIZE - offset); + } + part = min_t(loff_t, isize - *ppos, size); + + if (folio) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_splice)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so we can + * now splice it into the pipe. + */ + n = splice_folio_into_pipe(pipe, folio, *ppos, part); + folio_put(folio); + folio = NULL; + } else { + n = splice_zeropage_into_pipe(pipe, *ppos, part); + } + + if (!n) + break; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_is_full(pipe)) + break; + + cond_resched(); + } while (len); + + if (folio) + folio_put(folio); + + file_accessed(in); + return total_spliced ? total_spliced : error; +} + static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; @@ -2746,6 +3685,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); + if (info->flags & SHMEM_F_MAPPING_FROZEN) { + error = -EPERM; + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); @@ -2820,16 +3764,19 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. */ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_get_folio(inode, index, &folio, - SGP_FALLOC); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ @@ -2852,7 +3799,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. 
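A usage-level view of the fallocate paths touched above: a plain fallocate() populates the range through SGP_FALLOC, while FALLOC_FL_PUNCH_HOLE drops the folios again. A short demo against a memfd, assuming Linux with memfd_create() and tmpfs hole-punch support:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        int fd = memfd_create("falloc-demo", 0);
        struct stat st;

        if (fd < 0)
                return 1;
        if (fallocate(fd, 0, 0, 1 << 20) < 0)           /* preallocate 1 MiB */
                perror("fallocate");
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      64 << 10, 128 << 10) < 0)         /* punch a 128 KiB hole */
                perror("punch hole");
        if (fstat(fd, &st) == 0)
                printf("i_size %lld, blocks %lld\n",
                       (long long)st.st_size, (long long)st.st_blocks);
        close(fd);
        return 0;
}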
*/ if (!folio_test_uptodate(folio)) @@ -2900,7 +3847,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; - buf->f_ffree = sbinfo->free_inodes; + buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ @@ -2913,86 +3860,97 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. */ static int -shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, +shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; + int error; - inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); - if (inode) { - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + return -EINVAL; - error = 0; - dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = current_time(dir); - inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - } + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + error = security_inode_init_security(inode, dir, &dentry->d_name, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; + + dir->i_size += BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + inode_inc_iversion(dir); + + d_make_persistent(dentry, inode); return error; + out_iput: iput(inode); return error; } static int -shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, +shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; - int error = -ENOSPC; + int error; - inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (inode) { - error = security_inode_init_security(inode, dir, - NULL, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - d_tmpfile(file, inode); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } + error = security_inode_init_security(inode, dir, NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(file, inode); + +err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } -static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(&init_user_ns, dir, dentry, - mode | S_IFDIR, 0))) - return error; + error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); + if (error) + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } -static int shmem_create(struct user_namespace *mnt_userns, 
struct inode *dir, +static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret = 0; + int ret; /* * No ordinary (disk based) filesystem counts links as inodes; @@ -3004,18 +3962,19 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) - goto out; + return ret; + } + + ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (ret) { + if (inode->i_nlink) + shmem_free_inode(inode->i_sb, 0); + return ret; } dir->i_size += BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); - inc_nlink(inode); - ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ - d_instantiate(dentry, inode); -out: - return ret; + return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) @@ -3023,13 +3982,21 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) - shmem_free_inode(inode->i_sb); + shmem_free_inode(inode->i_sb, 0); + + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); - drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + simple_unlink(dir, dentry); + + /* + * For now, VFS can't deal with case-insensitive negative dentries, so + * we invalidate them + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + return 0; } @@ -3043,7 +4010,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } -static int shmem_whiteout(struct user_namespace *mnt_userns, +static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; @@ -3053,7 +4020,7 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, if (!whiteout) return -ENOMEM; - error = shmem_mknod(&init_user_ns, old_dir, whiteout, + error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) @@ -3076,31 +4043,35 @@ static int shmem_whiteout(struct user_namespace *mnt_userns, * it exists so that the VFS layer correctly free's it when it * gets overwritten. 
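The link/unlink hunks above keep charging BOGO_DIRENT_SIZE against the directory's i_size, so tmpfs directories report 2 * 20 bytes when empty plus 20 per entry. A quick userspace check of that accounting, assuming /dev/shm is a tmpfs mount (path and file names are arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const char *dir = "/dev/shm/bogo-demo";
        struct stat st;
        int fd;

        if (mkdir(dir, 0700) < 0)
                return 1;
        stat(dir, &st);
        printf("empty dir i_size: %lld\n", (long long)st.st_size);      /* 40 */

        fd = open("/dev/shm/bogo-demo/a", O_CREAT | O_WRONLY, 0600);
        if (fd >= 0)
                close(fd);
        stat(dir, &st);
        printf("after one create: %lld\n", (long long)st.st_size);      /* 60 */

        unlink("/dev/shm/bogo-demo/a");
        rmdir(dir);
        return 0;
}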
*/ -static int shmem_rename2(struct user_namespace *mnt_userns, +static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + return simple_offset_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { - int error; - - error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); + error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { @@ -3114,54 +4085,54 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - inode->i_ctime = current_time(old_dir); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } -static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, + inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (!inode) - return -ENOSPC; + if (IS_ERR(inode)) + return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) { - iput(inode); - return error; - } + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { - iput(inode); - return -ENOMEM; + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { + error = -ENOMEM; + goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); - if (error) { - iput(inode); - return error; - } inode->i_mapping->a_ops = &shmem_aops; + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); + if (error) + goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); @@ -3170,11 +4141,16 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); return 0; + +out_remove_offset: + 
simple_offset_remove(shmem_get_offset_ctx(dir), dentry); +out_iput: + iput(inode); + return error; } static void shmem_put_link(void *arg) @@ -3183,8 +4159,7 @@ static void shmem_put_link(void *arg) folio_put(arg); } -static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; @@ -3192,7 +4167,7 @@ static const char *shmem_get_link(struct dentry *dentry, if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); - if (!folio) + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { @@ -3200,7 +4175,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) @@ -3218,7 +4193,7 @@ static const char *shmem_get_link(struct dentry *dentry, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -3227,22 +4202,29 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) return 0; } -static int shmem_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; - info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - shmem_set_inode_flags(inode, info->fsflags); - inode->i_ctime = current_time(inode); + ret = shmem_set_inode_flags(inode, flags, dentry); + + if (ret) + return ret; + + info->fsflags = flags; + + inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } @@ -3258,25 +4240,43 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, * Callback for security_inode_init_security() for acquiring xattrs. 
*/ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; + size_t ispace = 0; size_t len; + if (sbinfo->max_inodes) { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + ispace += simple_xattr_space(xattr->name, + xattr->value_len + XATTR_SECURITY_PREFIX_LEN); + } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) - return -ENOMEM; + break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { kvfree(new_xattr); - return -ENOMEM; + break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, @@ -3287,6 +4287,16 @@ static int shmem_initxattrs(struct inode *inode, simple_xattr_add(&info->xattrs, new_xattr); } + if (xattr->name != NULL) { + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + simple_xattrs_free(&info->xattrs, NULL); + return -ENOMEM; + } + return 0; } @@ -3301,21 +4311,46 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); - int err; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattr *old_xattr; + size_t ispace = 0; name = xattr_full_name(handler, name); - err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); - if (!err) { - inode->i_ctime = current_time(inode); + if (value && sbinfo->max_inodes) { + ispace = simple_xattr_space(name, size); + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + if (!IS_ERR(old_xattr)) { + ispace = 0; + if (old_xattr && sbinfo->max_inodes) + ispace = simple_xattr_space(old_xattr->name, + old_xattr->size); + simple_xattr_free(old_xattr); + old_xattr = NULL; + inode_set_ctime_current(inode); inode_inc_iversion(inode); } - return err; + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3330,13 +4365,16 @@ static const struct xattr_handler shmem_trusted_xattr_handler = { .set = shmem_xattr_handler_set, }; -static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif +static const struct xattr_handler shmem_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; + +static const 
struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, + &shmem_user_xattr_handler, NULL }; @@ -3349,6 +4387,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3357,6 +4396,7 @@ static const struct inode_operations shmem_short_symlink_operations = { static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3384,7 +4424,6 @@ static struct dentry *shmem_find_alias(struct inode *inode) return alias ?: d_find_any_alias(inode); } - static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3455,6 +4494,17 @@ enum shmem_param { Opt_uid, Opt_inode32, Opt_inode64, + Opt_noswap, + Opt_quota, + Opt_usrquota, + Opt_grpquota, + Opt_usrquota_block_hardlimit, + Opt_usrquota_inode_hardlimit, + Opt_grpquota_block_hardlimit, + Opt_grpquota_inode_hardlimit, + Opt_casefold_version, + Opt_casefold, + Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3466,19 +4516,74 @@ static const struct constant_table shmem_param_enums_huge[] = { }; const struct fs_parameter_spec shmem_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), + fsparam_flag ("noswap", Opt_noswap), +#ifdef CONFIG_TMPFS_QUOTA + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), + fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), + fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), + fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), +#endif + fsparam_string("casefold", Opt_casefold_version), + fsparam_flag ("casefold", Opt_casefold), + fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; +#if IS_ENABLED(CONFIG_UNICODE) +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + struct shmem_options *ctx = fc->fs_private; + int version = UTF8_LATEST; + struct unicode_map *encoding; + char *version_str = param->string + 5; + + if (!latest_version) { + if (strncmp(param->string, "utf8-", 5)) + return invalfc(fc, "Only UTF-8 encodings are supported " + "in the format: utf8-<version number>"); + + version = utf8_parse_version(version_str); + if (version < 0) + return invalfc(fc, "Invalid UTF-8 version: %s", version_str); + } + + encoding = utf8_load(version); + + if (IS_ERR(encoding)) { + return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), + unicode_rev(version)); + } + + pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", + unicode_major(version), 
unicode_minor(version), unicode_rev(version)); + + ctx->encoding = encoding; + + return 0; +} +#else +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +} +#endif + static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; @@ -3486,6 +4591,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) unsigned long long size; char *rest; int opt; + kuid_t kuid; + kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) @@ -3507,13 +4614,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest || ctx->blocks > S64_MAX) + if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; @@ -3521,14 +4628,28 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) + kuid = result.uid; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; + + ctx->uid = kuid; break; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) + kgid = result.gid; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; + + ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; @@ -3559,6 +4680,78 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; + case Opt_noswap: + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { + return invalfc(fc, + "Turning off swap in unprivileged tmpfs mounts unsupported"); + } + ctx->noswap = true; + break; + case Opt_quota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); + break; + case Opt_usrquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_USR; + break; + case Opt_grpquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_GRP; + break; + case Opt_usrquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "User quota block hardlimit too large."); + ctx->qlimits.usrquota_bhardlimit = size; + break; + case Opt_grpquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "Group quota block hardlimit too large."); + ctx->qlimits.grpquota_bhardlimit = size; + break; + case Opt_usrquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "User quota inode hardlimit too large."); + ctx->qlimits.usrquota_ihardlimit = size; + break; + case Opt_grpquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "Group quota inode hardlimit too large."); + ctx->qlimits.grpquota_ihardlimit = size; + break; + case Opt_casefold_version: + return shmem_parse_opt_casefold(fc, param, false); + case Opt_casefold: + return shmem_parse_opt_casefold(fc, param, true); + case Opt_strict_encoding: +#if IS_ENABLED(CONFIG_UNICODE) + ctx->strict_encoding = true; + break; +#else + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +#endif } return 0; @@ -3568,67 +4761,52 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. 
- */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. - * - * Note that we disallow change from limited->unlimited blocks/inodes while any - * are in use; but we must separately disallow unlimited->limited, because in - * that case we have no record of how much is already in use. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); - unsigned long inodes; + unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); - inodes = sbinfo->max_inodes - sbinfo->free_inodes; + used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3646,7 +4824,7 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Cannot retroactively limit inodes"; goto out; } - if (ctx->inodes < inodes) { + if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } @@ -3658,6 +4836,33 @@ static int shmem_reconfigure(struct fs_context *fc) goto out; } + /* + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. 
+ */ + if (ctx->noswap && !sbinfo->noswap) { + err = "Cannot disable swap on remount"; + goto out; + } + + if (ctx->seen & SHMEM_SEEN_QUOTA && + !sb_any_quota_loaded(fc->root->d_sb)) { + err = "Cannot enable quota on remount"; + goto out; + } + +#ifdef CONFIG_TMPFS_QUOTA +#define CHANGED_LIMIT(name) \ + (ctx->qlimits.name## hardlimit && \ + (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) + + if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || + CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { + err = "Cannot change global quota limit on remount"; + goto out; + } +#endif /* CONFIG_TMPFS_QUOTA */ + if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) @@ -3666,7 +4871,7 @@ static int shmem_reconfigure(struct fs_context *fc) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; - sbinfo->free_inodes = ctx->inodes - inodes; + sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* @@ -3677,6 +4882,10 @@ static int shmem_reconfigure(struct fs_context *fc) sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } + + if (ctx->noswap) + sbinfo->noswap = true; + raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; @@ -3688,10 +4897,10 @@ out: static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); + struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) - seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_SHIFT - 10)); + seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) @@ -3730,7 +4939,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif - shmem_show_mpol(seq, sbinfo->mpol); + mpol = shmem_get_sbmpol(sbinfo); + shmem_show_mpol(seq, mpol); + mpol_put(mpol); + if (sbinfo->noswap) + seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } @@ -3740,6 +4971,14 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) + utf8_unload(sb->s_encoding); +#endif + +#ifdef CONFIG_TMPFS_QUOTA + shmem_disable_quotas(sb); +#endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); @@ -3747,17 +4986,25 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) +static const struct dentry_operations shmem_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + 
.d_compare = generic_ci_d_compare, +}; +#endif + static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; + int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) - return -ENOMEM; + return error; sb->s_fs_info = sbinfo; @@ -3774,16 +5021,35 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); + sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC | SB_I_VERSION; + sb->s_flags |= SB_NOSEC; + +#if IS_ENABLED(CONFIG_UNICODE) + if (!ctx->encoding && ctx->strict_encoding) { + pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); + error = -EINVAL; + goto failed; + } + + if (ctx->encoding) { + sb->s_encoding = ctx->encoding; + set_default_d_op(sb, &shmem_ci_dentry_ops); + if (ctx->strict_encoding) + sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; + } +#endif + #else sb->s_flags |= SB_NOUSER; -#endif +#endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; - sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) @@ -3793,7 +5059,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -3815,11 +5086,31 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif - uuid_gen(&sb->s_uuid); + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); + +#ifdef CONFIG_TMPFS_QUOTA + if (ctx->seen & SHMEM_SEEN_QUOTA) { + sb->dq_op = &shmem_quota_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; - inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); - if (!inode) + /* Copy the default limits from ctx into sbinfo */ + memcpy(&sbinfo->qlimits, &ctx->qlimits, + sizeof(struct shmem_quota_limits)); + + if (shmem_enable_quotas(sb, ctx->quota_types)) + goto failed; + } +#endif /* CONFIG_TMPFS_QUOTA */ + + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); goto failed; + } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); @@ -3829,7 +5120,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return -ENOMEM; + return error; } static int shmem_get_tree(struct fs_context *fc) @@ -3851,13 +5142,13 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = 
shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; -static struct kmem_cache *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -3879,6 +5170,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); + if (S_ISDIR(inode->i_mode)) + simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) @@ -3887,27 +5180,26 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static void shmem_init_inodecache(void) +static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } -static void shmem_destroy_inodecache(void) +static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ -static int shmem_error_remove_page(struct address_space *mapping, - struct page *page) +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) { return 0; } -const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, +static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -3916,20 +5208,19 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif - .error_remove_page = shmem_error_remove_page, + .error_remove_folio = shmem_error_remove_folio, }; -EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { - .mmap = shmem_mmap, - .open = generic_file_open, + .mmap_prepare = shmem_mmap_prepare, + .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, - .write_iter = generic_file_write_iter, + .write_iter = shmem_file_write_iter, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif @@ -3959,6 +5250,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, + .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3990,8 +5282,11 @@ static const struct super_operations shmem_ops = { .statfs = shmem_statfs, .show_options = shmem_show_options, #endif +#ifdef CONFIG_TMPFS_QUOTA + .get_dquots = shmem_get_dquots, +#endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, @@ -4029,8 +5324,15 @@ int shmem_init_fs_context(struct fs_context *fc) ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); +#if IS_ENABLED(CONFIG_UNICODE) + ctx->encoding = NULL; +#endif + fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif return 0; } @@ -4041,16 +5343,80 @@ static struct file_system_type shmem_fs_type = { #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif - .kill_sb = kill_litter_super, - .fs_flags = 
FS_USERNS_MOUNT, + .kill_sb = kill_anon_super, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, +}; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define TMPFS_ATTR_W(_name, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define TMPFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define TMPFS_ATTR_RO(_name, _show) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#if IS_ENABLED(CONFIG_UNICODE) +static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, + char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} +TMPFS_ATTR_RO(casefold, casefold_show); +#endif + +static struct attribute *tmpfs_attributes[] = { +#if IS_ENABLED(CONFIG_UNICODE) + &tmpfs_attr_casefold.attr, +#endif + NULL +}; + +static const struct attribute_group tmpfs_attribute_group = { + .attrs = tmpfs_attributes, + .name = "features" }; +static struct kobject *tmpfs_kobj; + +static int __init tmpfs_sysfs_init(void) +{ + int ret; + + tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); + if (!tmpfs_kobj) + return -ENOMEM; + + ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); + if (ret) + kobject_put(tmpfs_kobj); + + return ret; +} +#endif /* CONFIG_SYSFS && CONFIG_TMPFS */ + void __init shmem_init(void) { int error; shmem_init_inodecache(); +#ifdef CONFIG_TMPFS_QUOTA + register_quota_format(&shmem_quota_format); +#endif + error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); @@ -4064,17 +5430,35 @@ void __init shmem_init(void) goto out1; } +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + error = tmpfs_sysfs_init(); + if (error) { + pr_err("Could not init tmpfs sysfs\n"); + goto out1; + } +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. + */ + if (!shmem_orders_configured) + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: +#ifdef CONFIG_TMPFS_QUOTA + unregister_quota_format(&shmem_quota_format); +#endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } @@ -4096,11 +5480,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, - shmem_huge == values[i] ? "%s[%s]" : "%s%s", - i ? " " : "", - shmem_format_huge(values[i])); + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? 
" " : "", shmem_format_huge(values[i])); } - len += sysfs_emit_at(buf, len, "\n"); return len; @@ -4110,7 +5492,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; - int huge; + int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; @@ -4121,20 +5503,238 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge = shmem_parse_huge(tmp); if (huge == -EINVAL) - return -EINVAL; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) - return -EINVAL; + return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; - return count; + + err = start_stop_khugepaged(); + return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "advise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + 
spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + if (ret > 0) { + int err = start_stop_khugepaged(); + + if (err) + ret = err; + } + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) + +static int __init setup_transparent_hugepage_shmem(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge == -EINVAL) { + pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); + return huge; + } + + shmem_huge = huge; + return 1; +} +__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); + +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_shmem(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise, within_size; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strscpy(str_dup, str); + + always = huge_shmem_orders_always; + inherit = huge_shmem_orders_inherit; + madvise = huge_shmem_orders_madvise; + within_size = huge_shmem_orders_within_size; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "advise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "within_size")) { + bitmap_set(&within_size, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else { + pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); + goto err; + } + } + } + + huge_shmem_orders_always = always; + huge_shmem_orders_madvise = madvise; 
+ huge_shmem_orders_inherit = inherit; + huge_shmem_orders_within_size = within_size; + shmem_orders_configured = true; + return 1; + +err: + pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_shmem=", setup_thp_shmem); + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #else /* !CONFIG_SHMEM */ /* @@ -4150,7 +5750,7 @@ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; @@ -4181,11 +5781,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } @@ -4194,17 +5794,26 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); + return inode ? inode : ERR_PTR(-ENOSPC); +} + #endif /* CONFIG_SHMEM */ /* common code */ -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long vm_flags, + unsigned int i_flags) { + unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; @@ -4214,14 +5823,17 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); + if (is_idmapped_mnt(mnt)) + return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, - flags); - if (unlikely(!inode)) { + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, + S_IFREG | S_IRWXUGO, 0, vm_flags); + if (IS_ERR(inode)) { shmem_unacct_size(flags, size); - return ERR_PTR(-ENOSPC); + return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; @@ -4241,7 +5853,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. 
- * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4249,10 +5861,11 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4265,7 +5878,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4276,14 +5889,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); -/** - * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap - */ -int shmem_zero_setup(struct vm_area_struct *vma) +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict @@ -4291,7 +5899,18 @@ int shmem_zero_setup(struct vm_area_struct *vma) * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ - file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); + if (IS_ERR(file)) return PTR_ERR(file); @@ -4304,9 +5923,28 @@ int shmem_zero_setup(struct vm_area_struct *vma) } /** - * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. - * @mapping: the page's address_space - * @index: the page index + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + +/** + * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. 
+ * @mapping: the folio's address_space + * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", @@ -4318,22 +5956,39 @@ int shmem_zero_setup(struct vm_area_struct *vma) * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ -struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, - pgoff_t index, gfp_t gfp) +struct folio *shmem_read_folio_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; - struct page *page; int error; - BUG_ON(!shmem_mapping(mapping)); - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, - gfp, NULL, NULL, NULL); + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), + &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); + return folio; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return mapping_read_folio_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); + +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); + struct page *page; + + if (IS_ERR(folio)) + return &folio->page; + page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); @@ -4341,11 +5996,5 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, } return page; -#else - /* - * The tiny !SHMEM case uses ramfs without swap - */ - return read_cache_page_gfp(mapping, index, gfp); -#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); |
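With shmem_user_xattr_handler added to shmem_xattr_handlers above, tmpfs also accepts user.* extended attributes, and their space is charged against the same free_ispace pool that backs nr_inodes. A minimal userspace sketch, assuming a file that already lives on a tmpfs mount; the path and attribute name are only illustrative:

#include <sys/xattr.h>
#include <stdio.h>

int main(void)
{
	const char *path = "/tmp/somefile";	/* assumed to be on tmpfs */
	const char value[] = "cached-by-app";

	/* May fail with ENOSPC once the superblock's free_ispace is exhausted. */
	if (setxattr(path, "user.example", value, sizeof(value) - 1, 0) < 0) {
		perror("setxattr");
		return 1;
	}
	return 0;
}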
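The new mount options (noswap, the quota knobs, casefold=utf8-<version>) are all registered in shmem_fs_parameters, so they can be passed either as legacy comma-separated data through shmem_parse_monolithic or per key through the new mount API. A sketch of the latter, assuming a libc that exposes the fsopen()/fsconfig()/fsmount()/move_mount() wrappers (glibc 2.36 or later); the target directory, sizes and UTF-8 version string are illustrative, and error checking on the keyed fsconfig() calls is omitted:

#define _GNU_SOURCE
#include <sys/mount.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}

	fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "64M", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "casefold", "utf8-12.1.0", 0);	/* Opt_casefold_version */
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "strict_encoding", NULL, 0);
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "noswap", NULL, 0);	/* init_user_ns + CAP_SYS_ADMIN only */
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "usrquota", NULL, 0);	/* needs CONFIG_TMPFS_QUOTA */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "usrquota_block_hardlimit", "32M", 0);

	if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig(FSCONFIG_CMD_CREATE)");
		return 1;
	}

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0) {
		perror("fsmount");
		return 1;
	}

	if (move_mount(mntfd, "", AT_FDCWD, "/mnt/case", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		return 1;
	}
	return 0;
}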
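The per-size shmem_enabled attribute accepts the same keywords as the global knob, with "inherit" deferring to it, and the thp_shmem= boot parameter fills the same huge_shmem_orders_* bitmaps before any boot-time mounts. A small sketch that selects within_size for one folio order at runtime, roughly what thp_shmem=64K:within_size would do at boot; the 64kB size assumes a 4K-page kernel and is only an example:

#include <stdio.h>

int main(void)
{
	const char *knob =
		"/sys/kernel/mm/transparent_hugepage/hugepages-64kB/shmem_enabled";
	FILE *f = fopen(knob, "w");

	if (!f) {
		perror(knob);
		return 1;
	}
	/* Accepted values: always, inherit, within_size, advise, never. */
	fputs("within_size", f);
	if (fclose(f) != 0) {
		perror("shmem_enabled");
		return 1;
	}
	return 0;
}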
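On the kernel side, shmem_kernel_file_setup() is now exported (GPL) and the folio-based shmem_read_folio_gfp() replaces direct use of shmem_aops, which is no longer exported. A minimal in-kernel sketch under those assumptions; the function name and the single-folio usage are purely illustrative:

#include <linux/shmem_fs.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/sizes.h>
#include <linux/string.h>
#include <linux/err.h>

/* Back a 1MiB scratch buffer with an unlinked tmpfs file. */
static int example_use_shmem(void)
{
	struct file *file;
	struct folio *folio;
	void *kaddr;

	file = shmem_kernel_file_setup("example", SZ_1M, VM_NORESERVE);
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* Returns an unlocked folio, allocated and zeroed if not already present (SGP_CACHE). */
	folio = shmem_read_folio_gfp(file->f_mapping, 0, GFP_KERNEL);
	if (IS_ERR(folio)) {
		fput(file);
		return PTR_ERR(folio);
	}

	kaddr = kmap_local_folio(folio, 0);
	memset(kaddr, 0, PAGE_SIZE);
	kunmap_local(kaddr);

	folio_mark_dirty(folio);
	folio_put(folio);
	fput(file);
	return 0;
}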
