Diffstat (limited to 'mm/shmem.c')
-rw-r--r--	mm/shmem.c	6277
1 files changed, 4619 insertions, 1658 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index a87990cf9f94..3f194c9842a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,12 +28,22 @@ #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fileattr.h> #include <linux/mm.h> +#include <linux/random.h> +#include <linux/sched/signal.h> #include <linux/export.h> +#include <linux/shmem_fs.h> #include <linux/swap.h> -#include <linux/aio.h> +#include <linux/uio.h> +#include <linux/hugetlb.h> +#include <linux/fs_parser.h> +#include <linux/swapfile.h> +#include <linux/iversion.h> +#include <linux/unicode.h> +#include "swap.h" -static struct vfsmount *shm_mnt; +static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* @@ -45,20 +55,18 @@ static struct vfsmount *shm_mnt; #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> -#include <linux/generic_acl.h> +#include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> -#include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> @@ -66,64 +74,92 @@ static struct vfsmount *shm_mnt; #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <uapi/linux/memfd.h> +#include <linux/rmap.h> +#include <linux/uuid.h> +#include <linux/quotaops.h> +#include <linux/rcupdate_wait.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> +#include <linux/uaccess.h> -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#include "internal.h" + +#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Pretend that one inode + its dentry occupy this much memory */ +#define BOGO_INODE_SIZE 1024 + /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate and shmem_writepage communicate via inode->i_private - * (with i_mutex making sure that it has only one user at a time): - * we would prefer not to enlarge the shmem inode just for that. + * shmem_fallocate communicates with shmem_fault or shmem_writeout via + * inode->i_private (with i_rwsem making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. 
*/ struct shmem_falloc { + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; -/* Flag allocation requirements to shmem_getpage */ -enum sgp_type { - SGP_READ, /* don't exceed i_size, don't allocate page */ - SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ - SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ +struct shmem_options { + unsigned long long blocks; + unsigned long long inodes; + struct mempolicy *mpol; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool full_inums; + int huge; + int seen; + bool noswap; + unsigned short quota_types; + struct shmem_quota_limits qlimits; +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *encoding; + bool strict_encoding; +#endif +#define SHMEM_SEEN_BLOCKS 1 +#define SHMEM_SEEN_INODES 2 +#define SHMEM_SEEN_HUGE 4 +#define SHMEM_SEEN_INUMS 8 +#define SHMEM_SEEN_QUOTA 16 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +static bool shmem_orders_configured __initdata; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { - return totalram_pages / 2; + return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { - return min(totalram_pages - totalhigh_pages, totalram_pages / 2); + unsigned long nr_pages = totalram_pages(); + + return min3(nr_pages - totalhigh_pages(), nr_pages / 2, + ULONG_MAX / BOGO_INODE_SIZE); } #endif -static bool shmem_should_replace_page(struct page *page, gfp_t gfp); -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); - -static inline int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, int *fault_type) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); -} +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -138,32 +174,94 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_NORESERVE) ? + return (flags & SHMEM_F_NORESERVE) ? 
0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (!(flags & VM_NORESERVE)) + if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & SHMEM_F_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * pages are allocated, in order to allow large sparse files. + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { - return (flags & VM_NORESERVE) ? - security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; + if (!(flags & SHMEM_F_NORESERVE)) + return 0; + + return security_vm_enough_memory_mm(current->mm, + pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (flags & VM_NORESERVE) - vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); + if (flags & SHMEM_F_NORESERVE) + vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); +} + +int shmem_inode_acct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int err = -ENOSPC; + + if (shmem_acct_blocks(info->flags, pages)) + return err; + + might_sleep(); /* when quotas */ + if (sbinfo->max_blocks) { + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) + goto unacct; + + err = dquot_alloc_block_nodirty(inode, pages); + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); + goto unacct; + } + } else { + err = dquot_alloc_block_nodirty(inode, pages); + if (err) + goto unacct; + } + + return 0; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return err; +} + +static void shmem_inode_unacct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + might_sleep(); /* when quotas */ + dquot_free_block_nodirty(inode, pages); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; @@ -173,43 +271,160 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_anon_vm_ops; +static struct file_system_type shmem_fs_type; -static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; +bool shmem_mapping(const struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + +bool 
vma_is_anon_shmem(const struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + +bool vma_is_shmem(const struct vm_area_struct *vma) +{ + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; +} static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); -static int shmem_reserve_inode(struct super_block *sb) +#ifdef CONFIG_TMPFS_QUOTA + +static int shmem_enable_quotas(struct super_block *sb, + unsigned short quota_types) +{ + int type, err = 0; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < SHMEM_MAXQUOTAS; type++) { + if (!(quota_types & (1 << type))) + continue; + err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, + DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (err) + goto out_err; + } + return 0; + +out_err: + pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; +} + +static void shmem_disable_quotas(struct super_block *sb) +{ + int type; + + for (type = 0; type < SHMEM_MAXQUOTAS; type++) + dquot_quota_off(sb, type); +} + +static struct dquot __rcu **shmem_get_dquots(struct inode *inode) +{ + return SHMEM_I(inode)->i_dquot; +} +#endif /* CONFIG_TMPFS_QUOTA */ + +/* + * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and + * produces a novel ino for the newly allocated inode. + * + * It may also be called when making a hard link to permit the space needed by + * each dentry. However, in that case, no new inode number is needed since that + * internally draws from another pool of inode numbers (currently global + * get_next_ino()). This case is indicated by passing NULL as inop. + */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_inodes) { + if (sbinfo->free_ispace < BOGO_INODE_SIZE) { + raw_spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_ispace -= BOGO_INODE_SIZE; } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); + if (inop) { + ino = sbinfo->next_ino++; + if (unlikely(is_zero_ino(ino))) + ino = sbinfo->next_ino++; + if (unlikely(!sbinfo->full_inums && + ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + if (IS_ENABLED(CONFIG_64BIT)) + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", + __func__, MINOR(sb->s_dev)); + sbinfo->next_ino = 1; + ino = sbinfo->next_ino++; + } + *inop = ino; + } + raw_spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it + * doesn't hold stat_lock in shmem_reserve_inode since + * max_inodes is always 0, and is called from potentially + * unknown contexts. As such, use a per-cpu batched allocator + * which doesn't require the per-sb stat_lock unless we are at + * the batch boundary. + * + * We don't need to worry about inode{32,64} since SB_KERNMOUNT + * shmem mounts are not exposed to userspace, so we don't need + * to worry about things like glibc compatibility. 
+ */ + ino_t *next_ino; + + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { + raw_spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + raw_spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); } + return 0; } -static void shmem_free_inode(struct super_block *sb) +static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; + raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc + * @alloced: the change in number of pages allocated to inode + * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. @@ -217,198 +432,613 @@ static void shmem_free_inode(struct super_block *sb) * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * - * It has to be called with the spinlock held. + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode) +bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; - freed = info->alloced - info->swapped - inode->i_mapping->nrpages; - if (freed > 0) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -freed); - info->alloced -= freed; - inode->i_blocks -= freed * BLOCKS_PER_PAGE; - shmem_unacct_blocks(info->flags, freed); + spin_lock(&info->lock); + info->alloced += alloced; + info->swapped += swapped; + freed = info->alloced - info->swapped - + READ_ONCE(inode->i_mapping->nrpages); + /* + * Special case: whereas normally shmem_recalc_inode() is called + * after i_mapping->nrpages has already been adjusted (up or down), + * shmem_writeout() has to raise swapped before nrpages is lowered - + * to stop a racing shmem_recalc_inode() from thinking that a page has + * been freed. Compensate here, to avoid the need for a followup call. 
+ */ + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; + freed += swapped; } + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + + /* The quota case may block */ + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + return first_swapped; +} + +bool shmem_charge(struct inode *inode, long pages) +{ + struct address_space *mapping = inode->i_mapping; + + if (shmem_inode_acct_blocks(inode, pages)) + return false; + + /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ + xa_lock_irq(&mapping->i_pages); + mapping->nrpages += pages; + xa_unlock_irq(&mapping->i_pages); + + shmem_recalc_inode(inode, pages, 0); + return true; +} + +void shmem_uncharge(struct inode *inode, long pages) +{ + /* pages argument is currently unused: keep it to help debugging */ + /* nrpages adjustment done by __filemap_remove_folio() or caller */ + + shmem_recalc_inode(inode, 0, 0); } /* - * Replace item expected in radix tree by a new item, while holding tree lock. + * Replace item expected in xarray by a new item, while holding xa_lock. */ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - void **pslot; - void *item = NULL; + XA_STATE(xas, &mapping->i_pages, index); + void *item; VM_BUG_ON(!expected); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + VM_BUG_ON(!replacement); + item = xas_load(&xas); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check - * that an entry was not already brought back from swap by a racing thread. + * that an entry was not already brought back or split by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. + * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. + * Returns the swap entry's order if it still presents, else returns -1. */ -static bool shmem_confirm_swap(struct address_space *mapping, - pgoff_t index, swp_entry_t swap) +static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, + swp_entry_t swap) { - void *item; + XA_STATE(xas, &mapping->i_pages, index); + int ret = -1; + void *entry; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + do { + entry = xas_load(&xas); + if (entry == swp_to_radix_entry(swap)) + ret = xas_get_order(&xas); + } while (xas_retry(&xas, entry)); rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return ret; } /* - * Like add_to_page_cache_locked, but error if expected item has gone. 
+ * Definitions for "huge tmpfs": tmpfs mounted with the huge= option + * + * SHMEM_HUGE_NEVER: + * disables huge pages for the mount; + * SHMEM_HUGE_ALWAYS: + * enables huge pages for the mount; + * SHMEM_HUGE_WITHIN_SIZE: + * only allocate huge pages if the page will be fully within i_size, + * also respect madvise() hints; + * SHMEM_HUGE_ADVISE: + * only allocate huge pages if requested with madvise(); */ -static int shmem_add_to_page_cache(struct page *page, - struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + +#define SHMEM_HUGE_NEVER 0 +#define SHMEM_HUGE_ALWAYS 1 +#define SHMEM_HUGE_WITHIN_SIZE 2 +#define SHMEM_HUGE_ADVISE 3 + +/* + * Special values. + * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* ifdef here to avoid bloating shmem.o when not necessary */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; + +#undef SHMEM_HUGE_DEFAULT + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE +#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE +#else +#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER +#endif + +static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; + +#undef TMPFS_HUGE_DEFAULT + +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) { - int error; + pgoff_t aligned_index; + unsigned long order; + loff_t i_size; + + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(!PageSwapBacked(page)); + if (!S_ISREG(inode->i_mode)) + return 0; + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) + return maybe_pmd_order; - page_cache_get(page); - page->mapping = mapping; - page->index = index; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs with 'huge=always' or 'huge=within_size' mount option, + * we will always try PMD-sized order first. If that failed, it will + * fall back to small large folios. + */ + switch (SHMEM_SB(inode->i_sb)->huge) { + case SHMEM_HUGE_ALWAYS: + return THP_ORDERS_ALL_FILE_DEFAULT; + case SHMEM_HUGE_WITHIN_SIZE: + within_size_orders = shmem_get_orders_within_size(inode, + THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); + if (within_size_orders > 0) + return within_size_orders; + + fallthrough; + case SHMEM_HUGE_ADVISE: + if (vm_flags & VM_HUGEPAGE) + return THP_ORDERS_ALL_FILE_DEFAULT; + fallthrough; + default: + return 0; + } +} - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, index, page); +static int shmem_parse_huge(const char *str) +{ + int huge; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "never")) + huge = SHMEM_HUGE_NEVER; + else if (!strcmp(str, "always")) + huge = SHMEM_HUGE_ALWAYS; + else if (!strcmp(str, "within_size")) + huge = SHMEM_HUGE_WITHIN_SIZE; + else if (!strcmp(str, "advise")) + huge = SHMEM_HUGE_ADVISE; + else if (!strcmp(str, "deny")) + huge = SHMEM_HUGE_DENY; + else if (!strcmp(str, "force")) + huge = SHMEM_HUGE_FORCE; else - error = shmem_radix_tree_replace(mapping, index, expected, - page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + return -EINVAL; + + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + + return huge; +} + +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) +static const char *shmem_format_huge(int huge) +{ + switch (huge) { + case SHMEM_HUGE_NEVER: + return "never"; + case SHMEM_HUGE_ALWAYS: + return "always"; + case SHMEM_HUGE_WITHIN_SIZE: + return "within_size"; + case SHMEM_HUGE_ADVISE: + return "advise"; + case SHMEM_HUGE_DENY: + return "deny"; + case SHMEM_HUGE_FORCE: + return "force"; + default: + VM_BUG_ON(1); + return "bad_val"; } - return error; +} +#endif + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_free) +{ + LIST_HEAD(list), *pos, *next; + struct inode *inode; + struct shmem_inode_info *info; + struct folio *folio; + unsigned long batch = sc ? 
sc->nr_to_scan : 128; + unsigned long split = 0, freed = 0; + + if (list_empty(&sbinfo->shrinklist)) + return SHRINK_STOP; + + spin_lock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &sbinfo->shrinklist) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + + /* pin the inode */ + inode = igrab(&info->vfs_inode); + + /* inode is about to be evicted */ + if (!inode) { + list_del_init(&info->shrinklist); + goto next; + } + + list_move(&info->shrinklist, &list); +next: + sbinfo->shrinklist_len--; + if (!--batch) + break; + } + spin_unlock(&sbinfo->shrinklist_lock); + + list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; + int ret; + + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + + if (nr_to_free && freed >= nr_to_free) + goto move_back; + + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) + goto drop; + + /* No large folio at the end of the file: nothing to split */ + if (!folio_test_large(folio)) { + folio_put(folio); + goto drop; + } + + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + + /* + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. + * + * Waiting for the lock may lead to deadlock in the + * reclaim path. + */ + if (!folio_trylock(folio)) { + folio_put(folio); + goto move_back; + } + + ret = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + + /* If split failed move the inode on the list back to shrinklist */ + if (ret) + goto move_back; + + freed += next - end; + split++; +drop: + list_del_init(&info->shrinklist); + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). 
+ */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: + iput(inode); + } + + return split; +} + +static long shmem_unused_huge_scan(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (!READ_ONCE(sbinfo->shrinklist_len)) + return SHRINK_STOP; + + return shmem_unused_huge_shrink(sbinfo, sc, 0); +} + +static long shmem_unused_huge_count(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + return READ_ONCE(sbinfo->shrinklist_len); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + +#define shmem_huge SHMEM_HUGE_DENY + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_free) +{ + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + +/* + * Somewhat like filemap_add_folio, but error if expected item has gone. + */ +int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp) +{ + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); + unsigned long nr = folio_nr_pages(folio); + swp_entry_t iter, swap; + void *entry; + + VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + + folio_ref_add(folio, nr); + folio->mapping = mapping; + folio->index = index; + + gfp &= GFP_RECLAIM_MASK; + folio_throttle_swaprate(folio, gfp); + swap = radix_to_swp_entry(expected); + + do { + iter = swap; + xas_lock_irq(&xas); + xas_for_each_conflict(&xas, entry) { + /* + * The range must either be empty, or filled with + * expected swap entries. Shmem swap entries are never + * partially freed without split of both entry and + * folio, so there shouldn't be any holes. + */ + if (!expected || entry != swp_to_radix_entry(iter)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + iter.val += 1 << xas_get_order(&xas); + } + if (expected && iter.val - nr != swap.val) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, folio); + if (xas_error(&xas)) + goto unlock; + shmem_update_stats(folio, nr); + mapping->nrpages += nr; +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); + } + + return 0; } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 
*/ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_zone_page_state(page, NR_FILE_PAGES); - __dec_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + xa_lock_irq(&mapping->i_pages); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + shmem_update_stats(folio, -nr); + xa_unlock_irq(&mapping->i_pages); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Like find_get_pages, but collecting swap entries as well as pages. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; + int order = xa_get_order(&mapping->i_pages, index); + void *old; - if (!nr_pages) + old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); + if (old != radswap) return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; +} + +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given offsets are swapped out. + * + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct folio *folio; + unsigned long swapped = 0; + unsigned long max = end - 1; rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) + if (xa_is_value(folio)) + swapped += 1 << xas_get_order(&xas); + if (xas.xa_index == max) break; + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } } rcu_read_unlock(); - return ret; + + return swapped << PAGE_SHIFT; } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. 
+ * + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. */ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +unsigned long shmem_swap_usage(struct vm_area_struct *vma) { - int error; + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; - spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); - spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, vma->vm_pgoff, + vma->vm_pgoff + vma_pages(vma)); } /* @@ -416,46 +1046,64 @@ static void shmem_deswap_pagevec(struct pagevec *pvec) */ void shmem_unlock_mapping(struct address_space *mapping) { - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; + struct folio_batch fbatch; pgoff_t index = 0; - pagevec_init(&pvec, 0); + folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ - while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) - break; - index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); - check_move_unevictable_pages(pvec.pages, pvec.nr); - pagevec_release(&pvec); + while (!mapping_unevictable(mapping) && + filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { + check_move_unevictable_folios(&fbatch); + folio_batch_release(&fbatch); cond_resched(); } } +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + /* + * At first avoid shmem_get_folio(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated folios as holes. + */ + folio = filemap_get_entry(inode->i_mapping, index); + if (!folio) + return folio; + if (!xa_is_value(folio)) { + folio_lock(folio); + if (folio->mapping == inode->i_mapping) + return folio; + /* The folio has been swapped out */ + folio_unlock(folio); + folio_put(folio); + } + /* + * But read a folio back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + folio = NULL; + shmem_get_folio(inode, index, 0, &folio, SGP_READ); + return folio; +} + /* - * Remove range of pages and swap entries from radix tree, and free them. + * Remove range of pages and swap entries from page cache, and free them. 
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ -static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, +static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; - unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); - struct pagevec pvec; + pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t end = (lend + 1) >> PAGE_SHIFT; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@ -463,426 +1111,593 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec, 0); - index = start; - while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) - break; - mem_cgroup_uncharge_start(); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; - index = indices[i]; - if (index >= end) - break; + folio_batch_init(&fbatch); + index = start; + while (index < end && find_lock_entries(mapping, &index, end - 1, + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + nr_swaps_freed += shmem_free_swap(mapping, + indices[i], folio); continue; } - if (!trylock_page(page)) - continue; - if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); - } - } - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - mem_cgroup_uncharge_end(); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); - index++; } - if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); - if (page) { - unsigned int top = PAGE_CACHE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); + /* + * When undoing a failed fallocate, we want none of the partial folio + * zeroing and splitting below, but shall want to truncate the whole + * folio when !uptodate indicates that it was added by this fallocate, + * even when [lstart, lend] covers only a part of the folio. 
+ */ + if (unfalloc) + goto whole_folios; + + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_next_pos(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio_next_index(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ, NULL); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return; + +whole_folios: index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) { - if (index == start || unfalloc) + + if (!find_get_entries(mapping, &index, end - 1, &fbatch, + indices)) { + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - break; - } - mem_cgroup_uncharge_start(); - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; - index = indices[i]; - if (index >= end) - break; + if (xa_is_value(folio)) { + long swaps_freed; - if (radix_tree_exceptional_entry(page)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { + /* Swap was replaced by page: retry */ + index = indices[i]; + break; + } + nr_swaps_freed += swaps_freed; continue; } - lock_page(page); - if (!unfalloc || !PageUptodate(page)) { - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + folio_lock(folio); + + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { + /* Page was replaced by swap: retry */ + folio_unlock(folio); + index = indices[i]; + break; + } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + + if (!folio_test_large(folio)) { + truncate_inode_folio(mapping, folio); + } else if (truncate_inode_partial_folio(folio, lstart, lend)) { + /* + * If we split a page, reset the loop so + * that we pick up the new sub pages. + * Otherwise the THP was entirely + * dropped or the target range was + * zeroed, so just continue the loop as + * is. 
+ */ + if (!folio_test_large(folio)) { + folio_unlock(folio); + index = start; + break; + } } } - unlock_page(page); + folio_unlock(folio); } - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - mem_cgroup_uncharge_end(); - index++; + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); } - spin_lock(&info->lock); - info->swapped -= nr_swaps_freed; - shmem_recalc_inode(inode); - spin_unlock(&info->lock); + shmem_recalc_inode(inode, 0, -nr_swaps_freed); } -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_setattr(struct dentry *dentry, struct iattr *attr) +static int shmem_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = path->dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + + if (info->alloced - info->swapped != inode->i_mapping->nrpages) + shmem_recalc_inode(inode, 0, 0); + + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(idmap, request_mask, inode, stat); + + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) + stat->blksize = HPAGE_PMD_SIZE; + + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = info->i_crtime.tv_sec; + stat->btime.tv_nsec = info->i_crtime.tv_nsec; + } + + return 0; +} + +static int shmem_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; - error = inode_change_ok(inode, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; + if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { + if ((inode->i_mode ^ attr->ia_mode) & 0111) { + return -EPERM; + } + } + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_rwsem */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { + if (info->flags & SHMEM_F_MAPPING_FROZEN) + return -EPERM; + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; + update_mtime = true; + } else { + update_ctime = false; } - if (newsize < oldsize) { + if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - shmem_truncate_range(inode, newsize, (loff_t)-1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); /* unmap again to remove racily COWed 
private pages */ - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); } } - setattr_copy(inode, attr); -#ifdef CONFIG_TMPFS_POSIX_ACL + if (is_quota_modification(idmap, inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } + + /* Transfer quota accounting */ + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + error = dquot_transfer(idmap, inode, attr); + if (error) + return error; + } + + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = generic_acl_chmod(inode); -#endif + error = posix_acl_chmod(idmap, dentry, inode->i_mode); + if (!error && update_ctime) { + inode_set_ctime_current(inode); + if (update_mtime) + inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); + inode_inc_iversion(inode); + } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + size_t freed = 0; - if (inode->i_mapping->a_ops == &shmem_aops) { + if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; + mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); - if (!list_empty(&info->swaplist)) { - mutex_lock(&shmem_swaplist_mutex); - list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + if (!list_empty(&info->shrinklist)) { + spin_lock(&sbinfo->shrinklist_lock); + if (!list_empty(&info->shrinklist)) { + list_del_init(&info->shrinklist); + sbinfo->shrinklist_len--; + } + spin_unlock(&sbinfo->shrinklist_lock); } - } else - kfree(info->symlink); + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); + spin_lock(&shmem_swaplist_lock); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); + spin_unlock(&shmem_swaplist_lock); + } + } - simple_xattrs_free(&info->xattrs); + simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); - shmem_free_inode(inode->i_sb); clear_inode(inode); +#ifdef CONFIG_TMPFS_QUOTA + dquot_free_inode(inode); + dquot_drop(inode); +#endif } -/* - * If swap found in inode, free it and move page from swapcache to filecache. - */ -static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page **pagep) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { - struct address_space *mapping = info->vfs_inode.i_mapping; - void *radswap; - pgoff_t index; - gfp_t gfp; - int error = 0; + XA_STATE(xas, &mapping->i_pages, start); + struct folio *folio; + swp_entry_t entry; - radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); - if (index == -1) - return 0; + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; - /* - * Move _head_ to start search for next from here. - * But be careful: shmem_evict_inode checks list_empty without taking - * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. 
- */ - if (shmem_swaplist.next != &info->swaplist) - list_move_tail(&shmem_swaplist, &info->swaplist); - - gfp = mapping_gfp_mask(mapping); - if (shmem_should_replace_page(*pagep, gfp)) { - mutex_unlock(&shmem_swaplist_mutex); - error = shmem_replace_page(pagep, gfp, info, index); - mutex_lock(&shmem_swaplist_mutex); - /* - * We needed to drop mutex to make that restrictive page - * allocation, but the inode might have been freed while we - * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock - * on this swapcache page is not enough to prevent that - - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. - * - * We must not proceed to shmem_add_to_page_cache() if the - * inode has been freed, but of course we cannot rely on - * inode or mapping or info to check that. However, we can - * safely check if our swap entry is still in use (and here - * it can't have got reused for another page): if it's still - * in use, then the inode cannot have been freed yet, and we - * can safely proceed (if it's no longer in use, that tells - * nothing about the inode, but we don't need to unuse swap). - */ - if (!page_swapcount(*pagep)) - error = -ENOENT; - } + if (!xa_is_value(folio)) + continue; - /* - * We rely on shmem_swaplist_mutex, not only to protect the swaplist, - * but also to hold up shmem_evict_inode(): so inode cannot be freed - * beneath us (pagelock doesn't help until the page is in pagecache). - */ - if (!error) - error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); - if (error != -ENOMEM) { + entry = radix_to_swp_entry(folio); /* - * Truncation and eviction use free_swap_and_cache(), which - * only does trylock page: if we raced, best clean up here. + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. */ - delete_from_swap_cache(*pagep); - set_page_dirty(*pagep); - if (!error) { - spin_lock(&info->lock); - info->swapped--; - spin_unlock(&info->lock); - swap_free(swap); + if (swp_type(entry) != type) + continue; + + indices[folio_batch_count(fbatch)] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) + break; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); } - error = 1; /* not an error, but entry was found */ } - return error; + rcu_read_unlock(); + + return folio_batch_count(fbatch); } /* - * Search through swapped inodes to find and replace swap by page. + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. */ -int shmem_unuse(swp_entry_t swap, struct page *page) +static int shmem_unuse_swap_entries(struct inode *inode, + struct folio_batch *fbatch, pgoff_t *indices) { - struct list_head *this, *next; - struct shmem_inode_info *info; - int found = 0; + int i = 0; + int ret = 0; int error = 0; + struct address_space *mapping = inode->i_mapping; - /* - * There's a faint possibility that swap page was replaced before - * caller locked it: caller will come back later with the right page. - */ - if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) - goto out; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - /* - * Charge page using GFP_KERNEL while we can wait, before taking - * the shmem_swaplist_mutex which might hold up shmem_writepage(). 
- * Charged back to the user (not to caller) when swap account is used. - */ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); - if (error) - goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); + if (error == 0) { + folio_unlock(folio); + folio_put(folio); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; + } + return error ? error : ret; +} - mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(this, next, &shmem_swaplist) { - info = list_entry(this, struct shmem_inode_info, swaplist); - if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); - else +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ +static int shmem_unuse_inode(struct inode *inode, unsigned int type) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct folio_batch fbatch; + pgoff_t indices[PAGEVEC_SIZE]; + int ret = 0; + + do { + folio_batch_init(&fbatch); + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { + ret = 0; + break; + } + + ret = shmem_unuse_swap_entries(inode, &fbatch, indices); + if (ret < 0) + break; + + start = indices[folio_batch_count(&fbatch) - 1]; + } while (true); + + return ret; +} + +/* + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. + */ +int shmem_unuse(unsigned int type) +{ + struct shmem_inode_info *info, *next; + int error = 0; + + if (list_empty(&shmem_swaplist)) + return 0; + + spin_lock(&shmem_swaplist_lock); +start_over: + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { list_del_init(&info->swaplist); + continue; + } + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). + */ + atomic_inc(&info->stop_eviction); + spin_unlock(&shmem_swaplist_lock); + + error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - if (found) + + spin_lock(&shmem_swaplist_lock); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); + if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); - if (found < 0) - error = found; -out: - unlock_page(page); - page_cache_release(page); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @plug: swap plug + * @folio_list: list to put back folios on split + * + * Move the folio from the page cache to the swap cache. 
*/ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { - struct shmem_inode_info *info; - struct address_space *mapping; - struct inode *inode; - swp_entry_t swap; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; + int nr_pages; + bool split = false; - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; - inode = mapping->host; - info = SHMEM_I(inode); - if (info->flags & VM_LOCKED) + if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; + if (!total_swap_pages) goto redirty; /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * might use ->writepage of its underlying filesystem, in which case - * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ - if (!wbc->for_reclaim) { - WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ - goto redirty; + if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; } + if (split) { +try_split: + /* Ensure the subpages are still dirty */ + folio_test_set_dirty(folio); + if (split_folio_to_list(folio, folio_list)) + goto redirty; + folio_clear_dirty(folio); + } + + index = folio->index; + nr_pages = folio_nr_pages(folio); + /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. 
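A worked example of the large-folio split check above, with hypothetical numbers:

	/*
	 * Sketch: an order-4 folio covers page indices 32..47, so
	 * folio->index == 32 and folio_next_index(folio) == 48.
	 * If i_size ends at page index 40 and nothing is fallocated
	 * beyond EOF, shmem_fallocend() returns 40; 32 < 40 < 48, so
	 * the EOF boundary cuts through the folio and it is split
	 * before swapout.  With i_size (or a preallocation) covering
	 * all 48 pages, and CONFIG_THP_SWAP enabled, it goes out whole.
	 */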
*/ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && + !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } - swap = get_swap_page(); - if (!swap.val) - goto redirty; + if (!folio_alloc_swap(folio)) { + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + swap_shmem_alloc(folio->swap, nr_pages); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - spin_lock(&info->lock); - info->swapped++; - shmem_recalc_inode(inode); - spin_unlock(&info->lock); + BUG_ON(folio_mapped(folio)); + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } - mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); - return 0; - } + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } - mutex_unlock(&shmem_swaplist_mutex); - swapcache_free(swap, NULL); + /* + * The swap_cache_del_folio() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. 
+ */ + swap_cache_del_folio(folio); + goto redirty; + } + if (nr_pages > 1) + goto try_split; redirty: - set_page_dirty(page); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ - unlock_page(page); - return 0; + folio_mark_dirty(folio); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } +EXPORT_SYMBOL_GPL(shmem_writeout); -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -899,86 +1714,379 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct page *page; + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + folio = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); - page = swapin_readahead(swap, gfp, &pvma, 0); + return folio; +} - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); - return page; + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. 
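To make the flag arithmetic described above concrete, a hypothetical narrowing (flag values chosen for illustration only):

	/*
	 * huge_gfp  = __GFP_IO | __GFP_FS | __GFP_DIRECT_RECLAIM | __GFP_NOWARN
	 * limit_gfp = __GFP_IO | __GFP_DIRECT_RECLAIM | __GFP_NORETRY
	 *
	 * Allow flags are intersected: __GFP_FS is dropped because the
	 * limit lacks it, while __GFP_IO and __GFP_DIRECT_RECLAIM survive.
	 * Deny flags are unioned: __GFP_NORETRY is picked up from the
	 * limit and __GFP_NOWARN is kept from huge_gfp.  Zone bits are
	 * taken from limit_gfp alone.
	 */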
+ */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; } -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool shmem_hpage_pmd_enabled(void) { - struct vm_area_struct pvma; - struct page *page; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && + shmem_huge != SHMEM_HUGE_NEVER) + return true; + + return false; +} - /* Create a pseudo vma that just contains the policy */ - pvma.vm_start = 0; - /* Bias interleave by inode number to distribute better across nodes */ - pvma.vm_pgoff = index + info->vfs_inode.i_ino; - pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + loff_t write_end, bool shmem_huge_force) +{ + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; + unsigned int global_orders; - page = alloc_page_vma(gfp, &pvma, 0); + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) + return 0; - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(pvma.vm_policy); + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; - return page; + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. + */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; + + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); + + if (global_orders > 0) + mask |= READ_ONCE(huge_shmem_orders_inherit); + + return THP_ORDERS_ALL_FILE_DEFAULT & mask; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) { + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + pgoff_t aligned_index; + unsigned long pages; + int order; + + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. 
+ * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } + + return orders; } -#endif /* CONFIG_TMPFS */ +#else +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, + struct shmem_inode_info *info, pgoff_t index) { - return swapin_readahead(swap, gfp, NULL, 0); + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; + + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); + + return folio; } -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) { - return alloc_page(gfp); + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long suitable_orders = 0; + struct folio *folio = NULL; + pgoff_t aligned_index; + long pages; + int error, order; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + orders = 0; + + if (orders > 0) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + + order = highest_order(suitable_orders); + while (suitable_orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, aligned_index); + if (folio) { + index = aligned_index; + goto allocated; + } + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); + order = next_order(&suitable_orders, order); + } + } else { + pages = 1; + folio = shmem_alloc_folio(gfp, 0, info, index); + } + if (!folio) + return ERR_PTR(-ENOMEM); + +allocated: + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); + } + goto unlock; + } + + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. + */ + shmem_unused_huge_shrink(sbinfo, NULL, pages); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. 
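A worked example of the rebalancing that follows, using hypothetical counts for this inode:

	/*
	 * Sketch: the new folio has pages == 16 and is already in the
	 * page cache, but not yet counted in info->alloced.  Say
	 * info->alloced == 100, info->swapped == 10, and the shrink
	 * above split a large folio of this same inode and freed 8 tail
	 * pages beyond EOF, leaving mapping->nrpages == 98.  Then
	 *
	 *	freed = 16 + 100 - 10 - 98 = 8
	 *
	 * so 8 blocks are unaccounted and the 16-page charge is retried.
	 */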
+ */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); + return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } -#endif /* CONFIG_NUMA */ -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) { - return NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + int nr_pages = 1 << order; + struct folio *new; + gfp_t alloc_gfp; + void *shadow; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + alloc_gfp = gfp; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (WARN_ON_ONCE(order)) + return ERR_PTR(-EINVAL); + } else if (order) { + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(entry, nr_pages) != nr_pages) + goto fallback; + + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); + } +retry: + new = shmem_alloc_folio(alloc_gfp, order, info, index); + if (!new) { + new = ERR_PTR(-ENOMEM); + goto fallback; + } + + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + alloc_gfp, entry)) { + folio_put(new); + new = ERR_PTR(-ENOMEM); + goto fallback; + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + new = ERR_PTR(-EEXIST); + /* Try smaller folio to avoid cache conflict */ + goto fallback; + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + memcg1_swapin(entry, nr_pages); + shadow = swap_cache_get_shadow(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +fallback: + /* Order 0 swapin failed, nothing to fallback to, abort */ + if (!order) + return new; + entry.val += index - round_down(index, nr_pages); + alloc_gfp = gfp; + nr_pages = 1; + order = 0; + goto retry; } -#endif /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. 
If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -988,329 +2096,783 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ -static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { - return page_zonenum(page) > gfp_zone(gfp); + return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct page *oldpage, *newpage; - struct address_space *swap_mapping; - pgoff_t swap_index; - int error; - - oldpage = *pagep; - swap_index = page_private(oldpage); - swap_mapping = page_mapping(oldpage); + struct swap_cluster_info *ci; + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + int nr_pages = folio_nr_pages(old); + int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); + if (!new) return -ENOMEM; - page_cache_get(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_ref_add(new, nr_pages); + folio_copy(new, old); + flush_dcache_folio(new); + + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + new->swap = entry; + folio_set_swapcache(new); - __set_page_locked(newpage); - SetPageUptodate(newpage); - SetPageSwapBacked(newpage); - set_page_private(newpage, swap_index); - SetPageSwapCache(newpage); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); + swap_cluster_unlock_irq(ci); + folio_add_lru(new); + *foliop = new; + + folio_clear_swapcache(old); + old->private = NULL; + + folio_unlock(old); /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. + * The old folio are removed from swap cache, drop the 'nr_pages' + * reference, as well as one temporary reference getting from swap + * cache. 
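Following the comment above, the reference arithmetic for a hypothetical order-2 replacement looks like this:

	/*
	 * Sketch: nr_pages == 4.  folio_ref_add(new, 4) gives the new
	 * folio the four references that the swap cache held on the old
	 * one, and folio_put_refs(old, 4 + 1) then drops those four from
	 * the old folio plus the single temporary reference taken when
	 * the caller looked it up in the swap cache.
	 */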
*/ - spin_lock_irq(&swap_mapping->tree_lock); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); - if (!error) { - __inc_zone_page_state(newpage, NR_FILE_PAGES); - __dec_zone_page_state(oldpage, NR_FILE_PAGES); + folio_put_refs(old, nr_pages + 1); + return error; +} + +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) +{ + struct address_space *mapping = inode->i_mapping; + swp_entry_t swapin_error; + void *old; + int nr_pages; + + swapin_error = make_poisoned_swp_entry(); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + nr_pages = folio_nr_pages(folio); + folio_wait_writeback(folio); + if (!skip_swapcache) + swap_cache_del_folio(folio); + /* + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks + * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) + * in shmem_evict_inode(). + */ + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + int split_order = 0; + int i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + void *old = NULL; + int cur_order; + pgoff_t swap_index; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + cur_order = xas_get_order(&xas); + if (!cur_order) + goto unlock; + + /* Try to split large swap entry in pagecache */ + swap_index = round_down(index, 1 << cur_order); + split_order = xas_try_split_min_order(cur_order); + + while (cur_order > 0) { + pgoff_t aligned_index = + round_down(index, 1 << cur_order); + pgoff_t swap_offset = aligned_index - swap_index; + + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, cur_order); + if (xas_error(&xas)) + goto unlock; + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << cur_order; + i += (1 << split_order)) { + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), + swp_offset(swap) + swap_offset + + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + cur_order = split_order; + split_order = xas_try_split_min_order(split_order); + } + +unlock: + xas_unlock_irq(&xas); + + if (!xas_nomem(&xas, gfp)) + break; + } + + if (xas_error(&xas)) + return xas_error(&xas); + + return 0; +} + +/* + * Swap in the folio pointed to by *foliop. + * Caller has to make sure that *foliop contains a valid swapped folio. + * Returns 0 and the folio in foliop if success. On failure, returns the + * error code and NULL in *foliop. + */ +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? 
vma->vm_mm : NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swap; + softleaf_t index_entry; + struct swap_info_struct *si; + struct folio *folio = NULL; + bool skip_swapcache = false; + int error, nr_pages, order; + pgoff_t offset; + + VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); + index_entry = radix_to_swp_entry(*foliop); + swap = index_entry; + *foliop = NULL; + + if (softleaf_is_poison_marker(index_entry)) + return -EIO; + + si = get_swap_device(index_entry); + order = shmem_confirm_swap(mapping, index, index_entry); + if (unlikely(!si)) { + if (order < 0) + return -EEXIST; + else + return -EINVAL; + } + if (unlikely(order < 0)) { + put_swap_device(si); + return -EEXIST; + } + + /* index may point to the middle of a large entry, get the sub entry */ + if (order) { + offset = index - round_down(index, 1 << order); + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + + /* Look it up and read it in.. */ + folio = swap_cache_get_folio(swap); + if (!folio) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + /* Direct swapin skipping swap cache & readahead */ + folio = shmem_swap_alloc_folio(inode, vma, index, + index_entry, order, gfp); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + folio = NULL; + goto failed; + } + skip_swapcache = true; + } else { + /* Cached swapin only supports order 0 folio */ + folio = shmem_swapin_cluster(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); + } + } else { + swap_update_readahead(folio, NULL, 0); } - spin_unlock_irq(&swap_mapping->tree_lock); - if (unlikely(error)) { + if (order > folio_order(folio)) { /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. + * Swapin may get smaller folios due to various reasons: + * It may fallback to order 0 due to memory pressure or race, + * swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still stores + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. */ - oldpage = newpage; + error = shmem_split_large_entry(inode, index, index_entry, gfp); + if (error) + goto failed_nolock; + } + + /* + * If the folio is large, round down swap and index by folio size. + * No matter what race occurs, the swap layer ensures we either get + * a valid folio that has its swap entry aligned by size, or a + * temporarily invalid one which we'll abort very soon and retry. + * + * shmem_add_to_page_cache ensures the whole range contains expected + * entries and prevents any corruption, so any race split is fine + * too, it will succeed as long as the entries are still there. + */ + nr_pages = folio_nr_pages(folio); + if (nr_pages > 1) { + swap.val = round_down(swap.val, nr_pages); + index = round_down(index, nr_pages); + } + + /* + * We have to do this with the folio locked to prevent races. + * The shmem_confirm_swap below only checks if the first swap + * entry matches the folio, that's enough to ensure the folio + * is not used outside of shmem, as shmem swap entries + * and swap cache folios are never partially freed. 
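A worked example of the large swap entry handling above, with hypothetical indices:

	/*
	 * Sketch: a single order-4 swap entry backs page indices 64..79,
	 * so order == 4 and the entry's swap offset, say S, refers to
	 * index 64.  A fault at index 67 computes offset = 67 - 64 = 3
	 * and looks up swap offset S + 3.  If swapin only produced an
	 * order-0 folio, shmem_split_large_entry() breaks the 16-page
	 * entry up so each index carries its own entry; if a full
	 * order-4 folio came back, swap.val and index are both rounded
	 * back down to the 16-page boundary before insertion.
	 */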
+ */ + folio_lock(folio); + if ((!skip_swapcache && !folio_test_swapcache(folio)) || + shmem_confirm_swap(mapping, index, swap) < 0 || + folio->swap.val != swap.val) { + error = -EEXIST; + goto unlock; + } + if (!folio_test_uptodate(folio)) { + error = -EIO; + goto failed; + } + folio_wait_writeback(folio); + + /* + * Some architectures may have to restore extra metadata to the + * folio after reading from swap. + */ + arch_swap_restore(folio_swap(swap, folio), folio); + + if (shmem_should_replace_folio(folio, gfp)) { + error = shmem_replace_folio(&folio, gfp, info, index, vma); + if (error) + goto failed; + } + + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(swap), gfp); + if (error) + goto failed; + + shmem_recalc_inode(inode, 0, -nr_pages); + + if (sgp == SGP_WRITE) + folio_mark_accessed(folio); + + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); } else { - mem_cgroup_replace_page_cache(oldpage, newpage); - lru_cache_add_anon(newpage); - *pagep = newpage; + swap_cache_del_folio(folio); } + folio_mark_dirty(folio); + swap_free_nr(swap, nr_pages); + put_swap_device(si); - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + *foliop = folio; + return 0; +failed: + if (shmem_confirm_swap(mapping, index, swap) < 0) + error = -EEXIST; + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); +unlock: + if (folio) + folio_unlock(folio); +failed_nolock: + if (skip_swapcache) + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); + if (folio) + folio_put(folio); + put_swap_device(si); - unlock_page(oldpage); - page_cache_release(oldpage); - page_cache_release(oldpage); return error; } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap - * entry since a page cannot live in both the swap and page cache + * entry since a page cannot live in both the swap and page cache. + * + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; - struct shmem_sb_info *sbinfo; - struct page *page; - swp_entry_t swap; + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + struct mm_struct *fault_mm; + struct folio *folio; int error; - int once = 0; - int alloced = 0; + bool alloced; + unsigned long orders = 0; + + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; - if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: - swap.val = 0; - page = find_lock_page(mapping, index); - if (radix_tree_exceptional_entry(page)) { - swap = radix_to_swp_entry(page); - page = NULL; + if (sgp <= SGP_CACHE && + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) + return -EINVAL; + + alloced = false; + fault_mm = vma ? 
vma->vm_mm : NULL; + + folio = filemap_get_entry(inode->i_mapping, index); + if (folio && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(folio)) + folio_put(folio); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { - error = -EINVAL; - goto failed; + if (xa_is_value(folio)) { + error = shmem_swapin_folio(inode, index, &folio, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) + goto repeat; + + *foliop = folio; + return error; } - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (folio) { + folio_lock(folio); + + /* Has the folio been truncated or swapped out? */ + if (unlikely(folio->mapping != inode->i_mapping)) { + folio_unlock(folio); + folio_put(folio); + goto repeat; + } + if (sgp == SGP_WRITE) + folio_mark_accessed(folio); + if (folio_test_uptodate(folio)) + goto out; + /* fallocated folio */ if (sgp != SGP_READ) goto clear; - unlock_page(page); - page_cache_release(page); - page = NULL; - } - if (page || (sgp == SGP_READ && !swap.val)) { - *pagep = page; - return 0; + folio_unlock(folio); + folio_put(folio); } /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ - info = SHMEM_I(inode); - sbinfo = SHMEM_SB(inode->i_sb); - - if (swap.val) { - /* Look it up and read it in.. */ - page = lookup_swap_cache(swap); - if (!page) { - /* here we actually do the io */ - if (fault_type) - *fault_type |= VM_FAULT_MAJOR; - page = shmem_swapin(swap, gfp, info, index); - if (!page) { - error = -ENOMEM; - goto failed; - } - } - - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { - error = -EEXIST; /* try again */ - goto unlock; - } - if (!PageUptodate(page)) { - error = -EIO; - goto failed; - } - wait_on_page_writeback(page); - - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - if (error) - goto failed; - } - - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest (including mem_cgroup_uncharge_swapcache). - * Reset swap.val? No, leave it so "failed" goes back to - * "repeat": reading a hole and writing should succeed. 
- */ - if (error) - delete_from_swap_cache(page); - } - if (error) - goto failed; - - spin_lock(&info->lock); - info->swapped--; - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - - delete_from_swap_cache(page); - set_page_dirty(page); - swap_free(swap); + *foliop = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; - } else { - if (shmem_acct_block(info->flags)) { - error = -ENOSPC; - goto failed; - } - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) { - error = -ENOSPC; - goto unacct; - } - percpu_counter_inc(&sbinfo->used_blocks); - } + /* + * Fast cache lookup and swap lookup did not find it: allocate. + */ - page = shmem_alloc_page(gfp, info, index); - if (!page) { - error = -ENOMEM; - goto decused; - } + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } - SetPageSwapBacked(page); - __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, - gfp & GFP_RECLAIM_MASK); - if (error) - goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); - radix_tree_preload_end(); - } - if (error) { - mem_cgroup_uncharge_cache_page(page); - goto decused; + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); + if (orders > 0) { + gfp_t huge_gfp; + + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); + if (!IS_ERR(folio)) { + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); + goto alloced; } - lru_cache_add_anon(page); + if (PTR_ERR(folio) == -EEXIST) + goto repeat; + } - spin_lock(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - alloced = true; + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); + if (IS_ERR(folio)) { + error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; + folio = NULL; + goto unlock; + } +alloced: + alloced = true; + if (folio_test_large(folio) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + folio_next_index(folio)) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Part of the large folio is beyond i_size: subject + * to shrink under memory pressure. */ - if (sgp == SGP_FALLOC) - sgp = SGP_WRITE; -clear: + spin_lock(&sbinfo->shrinklist_lock); /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize - * it now, lest undo on failure cancel our earlier guarantee. 
+ * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() */ - if (sgp != SGP_WRITE) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; } - if (sgp == SGP_DIRTY) - set_page_dirty(page); + spin_unlock(&sbinfo->shrinklist_lock); + } + + if (sgp == SGP_WRITE) + folio_set_referenced(folio); + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. + */ + if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { + long i, n = folio_nr_pages(folio); + + for (i = 0; i < n; i++) + clear_highpage(folio_page(folio, i)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + if (sgp <= SGP_CACHE && + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - if (alloced) - goto trunc; - else - goto failed; + goto unlock; } - *pagep = page; +out: + *foliop = folio; return 0; /* * Error recovery. */ -trunc: - info = SHMEM_I(inode); - ClearPageDirty(page); - delete_from_page_cache(page); - spin_lock(&info->lock); - info->alloced--; - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_unlock(&info->lock); -decused: - sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); -unacct: - shmem_unacct_blocks(info->flags, 1); -failed: - if (swap.val && error != -EINVAL && - !shmem_confirm_swap(mapping, index, swap)) - error = -EEXIST; unlock: - if (page) { - unlock_page(page); - page_cache_release(page); + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); + if (folio) { + folio_unlock(folio); + folio_put(folio); } - if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); - spin_lock(&info->lock); - shmem_recalc_inode(inode); - spin_unlock(&info->lock); - goto repeat; - } - if (error == -EEXIST) /* from above or from radix_tree_insert */ - goto repeat; return error; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @write_end: end of a write, could extend inode size + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. + * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. 
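A minimal sketch of an in-kernel caller following the contract documented above (illustrative only; surrounding error handling trimmed):

	struct folio *folio;
	int err;

	err = shmem_get_folio(inode, index, 0, &folio, SGP_CACHE);
	if (err)
		return err;
	/* SGP_CACHE returns the folio locked; write into it, then: */
	folio_mark_dirty(folio);	/* before unlock, per the contract above */
	folio_unlock(folio);
	folio_put(folio);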
+ */ +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL); +} +EXPORT_SYMBOL_GPL(shmem_get_folio); + +/* + * This is like autoremove_wake_function, but it removes the wait queue + * entry unconditionally - even if something else had already woken the + * target. + */ +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { - struct inode *inode = file_inode(vma->vm_file); - int error; - int ret = VM_FAULT_LOCKED; + int ret = default_wake_function(wait, mode, sync, key); + list_del_init(&wait->entry); + return ret; +} - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); - if (ret & VM_FAULT_MAJOR) { - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; } return ret; } +static vm_fault_t shmem_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; + vm_fault_t ret = 0; + int err; + + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: noted in i_private. 
+ */ + if (unlikely(inode->i_private)) { + ret = shmem_falloc_wait(vmf, inode); + if (ret) + return ret; + } + + WARN_ON_ONCE(vmf->page != NULL); + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, + gfp, vmf, &ret); + if (err) + return vmf_error(err); + if (folio) { + vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } + return ret; +} + +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long uaddr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long addr; + unsigned long offset; + unsigned long inflated_len; + unsigned long inflated_addr; + unsigned long inflated_offset; + unsigned long hpage_size; + + if (len > TASK_SIZE) + return -ENOMEM; + + addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return addr; + if (IS_ERR_VALUE(addr)) + return addr; + if (addr & ~PAGE_MASK) + return addr; + if (addr > TASK_SIZE - len) + return addr; + + if (shmem_huge == SHMEM_HUGE_DENY) + return addr; + if (flags & MAP_FIXED) + return addr; + /* + * Our priority is to support MAP_SHARED mapped hugely; + * and support MAP_PRIVATE mapped hugely too, until it is COWed. + * But if caller specified an address hint and we allocated area there + * successfully, respect that as before. + */ + if (uaddr == addr) + return addr; + + hpage_size = HPAGE_PMD_SIZE; + if (shmem_huge != SHMEM_HUGE_FORCE) { + struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; + + if (file) { + VM_BUG_ON(file->f_op != &shmem_file_operations); + sb = file_inode(file)->i_sb; + } else { + /* + * Called directly from mm/mmap.c, or drivers/char/mem.c + * for "/dev/zero", to create a shared anonymous object. + */ + if (IS_ERR(shm_mnt)) + return addr; + sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
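To make the alignment strategy concrete, a hypothetical walk-through of the inflation performed below (4KiB pages assumed):

	/*
	 * Sketch: the highest enabled mTHP order is 4, so hpage_size ==
	 * 64KiB.  For a 256KiB MAP_SHARED request at pgoff 0, offset ==
	 * 0 and the length is inflated to 256KiB + 64KiB - 4KiB.  The
	 * address found for the inflated request is then rounded up to
	 * the next 64KiB boundary, which still leaves room for the
	 * original 256KiB precisely because of the inflation.
	 */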
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif + } + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) + return addr; + } + + if (len < hpage_size) + return addr; + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) + return addr; + if ((addr & (hpage_size - 1)) == offset) + return addr; + + inflated_len = len + hpage_size - PAGE_SIZE; + if (inflated_len > TASK_SIZE) + return addr; + if (inflated_len < len) + return addr; + + inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); + if (IS_ERR_VALUE(inflated_addr)) + return addr; + if (inflated_addr & ~PAGE_MASK) + return addr; + + inflated_offset = inflated_addr & (hpage_size - 1); + inflated_addr += offset - inflated_offset; + if (inflated_offset > offset) + inflated_addr += hpage_size; + + if (inflated_addr > TASK_SIZE - len) + return addr; + return inflated_addr; +} + #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { @@ -1319,487 +2881,792 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ + +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; - spin_lock(&info->lock); - if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) + /* + * What serializes the accesses to info->flags? + * ipc_lock_object() when called from shmctl_do_lock(), + * no serialization needed when called from shm_destroy(). 
+ */ + if (lock && !(info->flags & SHMEM_F_LOCKED)) { + if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; - info->flags |= VM_LOCKED; + info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); - info->flags &= ~VM_LOCKED; + if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); + info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: - spin_unlock(&info->lock); return retval; } -static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +static int shmem_mmap_prepare(struct vm_area_desc *desc) { + struct file *file = desc->file; + struct inode *inode = file_inode(file); + file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + desc->vm_ops = &shmem_vm_ops; + else + desc->vm_ops = &shmem_anon_vm_ops; return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static int shmem_file_open(struct inode *inode, struct file *file) +{ + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(inode, file); +} + +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +#if IS_ENABLED(CONFIG_UNICODE) +/* + * shmem_inode_casefold_flags - Deal with casefold file attribute flag + * + * The casefold file attribute needs some special checks. I can just be added to + * an empty dir, and can't be removed from a non-empty dir. + */ +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + unsigned int old = inode->i_flags; + struct super_block *sb = inode->i_sb; + + if (fsflags & FS_CASEFOLD_FL) { + if (!(old & S_CASEFOLD)) { + if (!sb->s_encoding) + return -EOPNOTSUPP; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + *i_flags = *i_flags | S_CASEFOLD; + } else if (old & S_CASEFOLD) { + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + return 0; +} +#else +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + if (fsflags & FS_CASEFOLD_FL) + return -EOPNOTSUPP; + + return 0; +} +#endif + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. + */ +static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) +{ + unsigned int i_flags = 0; + int ret; + + ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); + if (ret) + return ret; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags. 
+ */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); + + return 0; +} +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) +{ +} +#define shmem_initxattrs NULL +#endif + +static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) +{ + return &SHMEM_I(inode)->dir_offsets; +} + +static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino; + int err; - if (shmem_reserve_inode(sb)) - return NULL; + err = shmem_reserve_inode(sb, &ino); + if (err) + return ERR_PTR(err); inode = new_inode(sb); - if (inode) { - inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); - inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_generation = get_seconds(); - info = SHMEM_I(inode); - memset(info, 0, (char *)inode - (char *)info); - spin_lock_init(&info->lock); - info->flags = flags & VM_NORESERVE; - INIT_LIST_HEAD(&info->swaplist); - simple_xattrs_init(&info->xattrs); - cache_no_acl(inode); - - switch (mode & S_IFMT) { - default: - inode->i_op = &shmem_special_inode_operations; - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_mapping->a_ops = &shmem_aops; - inode->i_op = &shmem_inode_operations; - inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, - shmem_get_sbmpol(sbinfo)); - break; - case S_IFDIR: - inc_nlink(inode); - /* Some things misbehave if size == 0 on a directory */ - inode->i_size = 2 * BOGO_DIRENT_SIZE; - inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - break; - case S_IFLNK: + if (!inode) { + shmem_free_inode(sb, 0); + return ERR_PTR(-ENOSPC); + } + + inode->i_ino = ino; + inode_init_owner(idmap, inode, dir, mode); + inode->i_blocks = 0; + simple_inode_init_ts(inode); + inode->i_generation = get_random_u32(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); + info->seals = F_SEAL_SEAL; + info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0; + info->i_crtime = inode_get_mtime(inode); + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags, NULL); + INIT_LIST_HEAD(&info->shrinklist); + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) + mapping_set_large_folios(inode->i_mapping); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_offset_dir_operations; + simple_offset_init(shmem_get_offset_ctx(inode)); + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + + lockdep_annotate_inode_mutex_key(inode); + return inode; +} + +#ifdef CONFIG_TMPFS_QUOTA +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + int err; + struct inode *inode; + + inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); + if (IS_ERR(inode)) + return inode; + + err = dquot_initialize(inode); + if (err) + goto errout; + + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); + goto errout; + } + return inode; + +errout: + inode->i_flags |= S_NOQUOTA; + iput(inode); + return ERR_PTR(err); +} +#else +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); +} +#endif /* CONFIG_TMPFS_QUOTA */ + +#ifdef CONFIG_USERFAULTFD +int shmem_mfill_atomic_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + uffd_flags_t flags, + struct folio **foliop) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + void *page_kaddr; + struct folio *folio; + int ret; + pgoff_t max_off; + + if (shmem_inode_acct_blocks(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller. + */ + if (unlikely(*foliop)) { + folio_put(*foliop); + *foliop = NULL; + } + return -ENOMEM; + } + + if (!*foliop) { + ret = -ENOMEM; + folio = shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + goto out_unacct_blocks; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + page_kaddr = kmap_local_folio(folio, 0); /* - * Must not load anything in the rbtree, - * mpol_free_shared_policy will not be called. + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. 
For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. */ - mpol_shared_policy_init(&info->policy, NULL); - break; + pagefault_disable(); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(page_kaddr); + + /* fallback to copy_from_user outside mmap_lock */ + if (unlikely(ret)) { + *foliop = folio; + ret = -ENOENT; + /* don't free the page */ + goto out_unacct_blocks; + } + + flush_dcache_folio(folio); + } else { /* ZEROPAGE */ + clear_user_highpage(&folio->page, dst_addr); } - } else - shmem_free_inode(sb); - return inode; + } else { + folio = *foliop; + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + *foliop = NULL; + } + + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); + + ret = -EFAULT; + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(pgoff >= max_off)) + goto out_release; + + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (ret) + goto out_release; + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, flags); + if (ret) + goto out_delete_from_cache; + + shmem_recalc_inode(inode, 1, 0); + folio_unlock(folio); + return 0; +out_delete_from_cache: + filemap_remove_folio(folio); +out_release: + folio_unlock(folio); + folio_put(folio); +out_unacct_blocks: + shmem_inode_unacct_blocks(inode, 1); + return ret; } +#endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -#ifdef CONFIG_TMPFS_XATTR -static int shmem_initxattrs(struct inode *, const struct xattr *, void *); -#else -#define shmem_initxattrs NULL -#endif - static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + /* i_rwsem is held by caller */ + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + + if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && + pos + len > inode->i_size)) + return -EPERM; + + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); + if (ret) + return ret; + + if (folio_contain_hwpoisoned_page(folio)) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } + + *foliop = folio; + 
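[Illustrative aside, not part of the patch: the F_SEAL_GROW/F_SEAL_WRITE checks in shmem_write_begin() above are armed from userspace with memfd_create(2) plus fcntl(F_ADD_SEALS); the function's closing return follows just below. A minimal sketch, assuming glibc 2.27+ for the memfd_create() wrapper:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Sealing must be allowed at creation time. */
	int fd = memfd_create("sealed-demo", MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 4096) < 0) {
		perror("memfd setup");
		return 1;
	}
	/* From now on the write_begin path rejects writes with -EPERM. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_WRITE) < 0)
		perror("F_ADD_SEALS");
	if (write(fd, "x", 1) < 0)
		perror("write");	/* expected to fail with EPERM */
	close(fd);
	return 0;
}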
return 0; } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - if (!PageUptodate(page)) { - if (copied < PAGE_CACHE_SIZE) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); - zero_user_segments(page, 0, from, - from + copied, PAGE_CACHE_SIZE); + if (!folio_test_uptodate(folio)) { + if (copied < folio_size(folio)) { + size_t from = offset_in_folio(folio, pos); + folio_zero_segments(folio, 0, from, + from + copied, folio_size(folio)); } - SetPageUptodate(page); + folio_mark_uptodate(folio); } - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct inode *inode = file_inode(filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; - enum sgp_type sgp = SGP_READ; - - /* - * Might this read be for a stacking filesystem? Then when reading - * holes of a sparse file, we actually need to allocate those pages, - * and even mark them dirty, so it cannot exceed the max_blocks limit. - */ - if (segment_eq(get_fs(), KERNEL_DS)) - sgp = SGP_DIRTY; - - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + int error = 0; + ssize_t retval = 0; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); + bool fallback_page_copy = false; + size_t fsize; - end_index = i_size >> PAGE_CACHE_SHIFT; - if (index > end_index) + if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) - break; - } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + index = iocb->ki_pos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) + error = 0; break; } - if (page) - unlock_page(page); + if (folio) { + folio_unlock(folio); + + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + folio_put(folio); + error = -EIO; + break; + } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_copy = true; + } /* * We must evaluate after, since reads (unlike writes) - * are called without i_mutex protection against truncate + * are called without i_rwsem protection against truncate */ - nr = PAGE_CACHE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_CACHE_SHIFT; - if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; - if (nr <= offset) { - if (page) - page_cache_release(page); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + if (folio && likely(!fallback_page_copy)) + fsize = folio_size(folio); + else 
+ fsize = PAGE_SIZE; + offset = iocb->ki_pos & (fsize - 1); + nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_copy)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + /* - * Mark the page accessed if we read the beginning. + * Mark the folio accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + if (likely(!fallback_page_copy)) + ret = copy_folio_to_iter(folio, offset, nr, to); + else + ret = copy_page_to_iter(page, offset, nr, to); + folio_put(folio); + } else if (user_backed_iter(to)) { + /* + * Copy to user tends to be so well optimized, but + * clear_user() not so much, that it is noticeably + * faster to copy the zero page instead of clearing. + */ + ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { - page = ZERO_PAGE(0); - page_cache_get(page); + /* + * But submitting the same page twice in a row to + * splice() - or others? - can result in confusion: + * so don't attempt that optimization on pipes etc. + */ + ret = iov_iter_zero(nr, to); } - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). - */ - ret = actor(desc, page, offset, nr); - offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + retval += ret; + iocb->ki_pos += ret; - page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(to)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); + file_accessed(file); + return retval ? 
retval : error; } -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) +static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + ret = file_remove_privs(file); + if (ret) + goto unlock; + ret = file_update_time(file); + if (ret) + goto unlock; + ret = generic_perform_write(iocb, from); +unlock: + inode_unlock(inode); + return ret; +} - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; +static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return true; } -static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static void zero_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { - struct address_space *mapping = in->f_mapping; - struct inode *inode = mapping->host; - unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_DEF_BUFFERS]; - struct partial_page partial[PIPE_DEF_BUFFERS]; - struct page *page; - pgoff_t index, end_index; - loff_t isize, left; - int error, page_nr; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, - .ops = &page_cache_pipe_buf_ops, - .spd_release = spd_release_page, - }; +} - isize = i_size_read(inode); - if (unlikely(*ppos >= isize)) - return 0; +static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return false; +} - left = isize - *ppos; - if (unlikely(left < len)) - len = left; +static const struct pipe_buf_operations zero_pipe_buf_ops = { + .release = zero_pipe_buf_release, + .try_steal = zero_pipe_buf_try_steal, + .get = zero_pipe_buf_get, +}; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; +static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, + loff_t fpos, size_t size) +{ + size_t offset = fpos & ~PAGE_MASK; - index = *ppos >> PAGE_CACHE_SHIFT; - loff = *ppos & ~PAGE_CACHE_MASK; - req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + size = min_t(size_t, size, PAGE_SIZE - offset); - spd.nr_pages = find_get_pages_contig(mapping, index, - nr_pages, spd.pages); - index += spd.nr_pages; - error = 0; + if (!pipe_is_full(pipe)) { + struct pipe_buffer *buf = pipe_head_buf(pipe); - while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); - if (error) - break; - unlock_page(page); - spd.pages[spd.nr_pages++] = page; - index++; + *buf = (struct pipe_buffer) { + .ops = &zero_pipe_buf_ops, + .page = ZERO_PAGE(0), + .offset = offset, + .len = size, + }; + pipe->head++; } - index = *ppos >> PAGE_CACHE_SHIFT; - nr_pages = 
spd.nr_pages; - spd.nr_pages = 0; - - for (page_nr = 0; page_nr < nr_pages; page_nr++) { - unsigned int this_len; + return size; +} - if (!len) - break; +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct address_space *mapping = inode->i_mapping; + struct folio *folio = NULL; + size_t total_spliced = 0, used, npages, n, part; + loff_t isize; + int error = 0; - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = spd.pages[page_nr]; + /* Work out how much data we can actually add into the pipe */ + used = pipe_buf_usage(pipe); + npages = max_t(ssize_t, pipe->max_usage - used, 0); + len = min_t(size_t, len, npages * PAGE_SIZE); - if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, - SGP_CACHE, NULL); - if (error) - break; - unlock_page(page); - page_cache_release(spd.pages[page_nr]); - spd.pages[page_nr] = page; - } + do { + bool fallback_page_splice = false; + struct page *page = NULL; + pgoff_t index; + size_t size; - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) + if (*ppos >= i_size_read(inode)) break; - if (end_index == index) { - unsigned int plen; + index = *ppos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) + error = 0; + break; + } + if (folio) { + folio_unlock(folio); - plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; - if (plen <= loff) + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + error = -EIO; break; + } - this_len = min(this_len, plen - loff); - len = this_len; + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_splice = true; } - spd.partial[page_nr].offset = loff; - spd.partial[page_nr].len = this_len; - len -= this_len; - loff = 0; - spd.nr_pages++; - index++; - } + /* + * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + break; + /* + * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned + * pages. + */ + size = len; + if (unlikely(fallback_page_splice)) { + size_t offset = *ppos & ~PAGE_MASK; - while (page_nr < nr_pages) - page_cache_release(spd.pages[page_nr++]); + size = umin(size, PAGE_SIZE - offset); + } + part = min_t(loff_t, isize - *ppos, size); - if (spd.nr_pages) - error = splice_to_pipe(pipe, &spd); + if (folio) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_splice)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so we can + * now splice it into the pipe. 
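[Illustrative aside, not part of the patch; the splice_folio_into_pipe() call this comment introduces follows immediately below. The rewritten shmem_file_splice_read() is what services splice(2) when the source is a tmpfs file. A minimal sketch, assuming /dev/shm is tmpfs and the hypothetical file /dev/shm/demo exists:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int p[2];
	int fd = open("/dev/shm/demo", O_RDONLY);
	ssize_t n;

	if (fd < 0 || pipe(p) < 0) {
		perror("setup");
		return 1;
	}
	/* Data lands in the pipe without a userspace copy; holes in the
	 * file are represented by the zero-page pipe buffer shown above. */
	n = splice(fd, NULL, p[1], NULL, 65536, SPLICE_F_MOVE);
	if (n < 0)
		perror("splice");
	close(fd);
	return 0;
}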
+ */ + n = splice_folio_into_pipe(pipe, folio, *ppos, part); + folio_put(folio); + folio = NULL; + } else { + n = splice_zeropage_into_pipe(pipe, *ppos, part); + } - splice_shrink_spd(&spd); + if (!n) + break; + len -= n; + total_spliced += n; + *ppos += n; + in->f_ra.prev_pos = *ppos; + if (pipe_is_full(pipe)) + break; - if (error > 0) { - *ppos += error; - file_accessed(in); - } - return error; -} + cond_resched(); + } while (len); -/* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. - */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int whence) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; + if (folio) + folio_put(folio); - pagevec_init(&pvec, 0); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (whence == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (whence == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && whence == SEEK_DATA) || - (!page && whence == SEEK_HOLE)) { - done = true; - break; - } - } - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; + file_accessed(in); + return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t start, end; - loff_t new_offset; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); - /* We're holding i_mutex so we can access i_size directly */ - if (offset < 0) - offset = -EINVAL; - else if (offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_CACHE_SHIFT; - end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_CACHE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (whence == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - - offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + return -ENXIO; + + inode_lock(inode); + /* We're holding i_rwsem so we can access i_size directly */ + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + inode_unlock(inode); return offset; } @@ -1808,22 +3675,51 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; - mutex_lock(&inode->i_mutex); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + if (info->flags & SHMEM_F_MAPPING_FROZEN) { + error = -EPERM; + goto out; + } if (mode & FALLOC_FL_PUNCH_HOLE) { struct 
address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + error = -EPERM; + goto out; + } + + shmem_falloc.waitq = &shmem_falloc_waitq; + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); + spin_unlock(&inode->i_lock); error = 0; goto out; } @@ -1833,14 +3729,20 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; - start = offset >> PAGE_CACHE_SHIFT; - end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + + start = offset >> PAGE_SHIFT; + end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } + shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; @@ -1849,58 +3751,84 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); - for (index = start; index < end; index++) { - struct page *page; + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. + */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + + for (index = start; index < end; ) { + struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. */ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC, - NULL); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { - /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_CACHE_SHIFT, - (loff_t)index << PAGE_CACHE_SHIFT, true); + info->fallocend = undo_fallocend; + /* Remove the !uptodate folios we added */ + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } goto undone; } /* - * Inform shmem_writepage() how far we have reached. 
+ * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. + */ + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; + + /* + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ - shmem_falloc.next++; - if (!PageUptodate(page)) - shmem_falloc.nr_falloced++; + if (!folio_test_uptodate(folio)) + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = CURRENT_TIME; undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + if (!error) + file_modified(file); + inode_unlock(inode); return error; } @@ -1909,7 +3837,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; - buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; @@ -1919,9 +3847,12 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; - buf->f_ffree = sbinfo->free_inodes; + buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; } @@ -1929,124 +3860,143 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. 
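[Illustrative aside, not part of the patch, before the file-creation helpers that follow: the shmem_fallocate() path above is what userspace reaches through fallocate(2), including the hole-punch branch with its wait queue. A minimal sketch, assuming /dev/shm is tmpfs and glibc exposes fallocate() and the FALLOC_FL_* flags:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1 MiB of tmpfs pages... */
	if (fallocate(fd, 0, 0, 1 << 20) < 0)
		perror("fallocate");
	/* ...then give the middle 256 KiB back: this takes the
	 * FALLOC_FL_PUNCH_HOLE branch seen above. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      256 << 10, 256 << 10) < 0)
		perror("punch hole");
	close(fd);
	return 0;
}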
*/ static int -shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; + int error; - inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); - if (inode) { -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#endif - error = security_inode_init_security(inode, dir, - &dentry->d_name, - shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + return -EINVAL; - error = 0; - dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - } + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + error = security_inode_init_security(inode, dir, &dentry->d_name, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; + + dir->i_size += BOGO_DIRENT_SIZE; + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + inode_inc_iversion(dir); + + d_make_persistent(dentry, inode); + return error; + +out_iput: + iput(inode); return error; } static int -shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, + struct file *file, umode_t mode) { struct inode *inode; - int error = -ENOSPC; + int error; - inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (inode) { - error = security_inode_init_security(inode, dir, - NULL, - shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else - error = 0; -#endif - d_tmpfile(dentry, inode); + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } + error = security_inode_init_security(inode, dir, NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(file, inode); + +err_out: + return finish_open_simple(file, error); +out_iput: + iput(inode); return error; } -static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) - return error; + error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); + if (error) + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } -static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return shmem_mknod(dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. 
*/ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. */ - ret = shmem_reserve_inode(inode->i_sb); - if (ret) - goto out; + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb, NULL); + if (ret) + return ret; + } + + ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (ret) { + if (inode->i_nlink) + shmem_free_inode(inode->i_sb, 0); + return ret; + } dir->i_size += BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - inc_nlink(inode); - ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ - d_instantiate(dentry, inode); -out: - return ret; + inode_inc_iversion(dir); + return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) - shmem_free_inode(inode->i_sb); + shmem_free_inode(inode->i_sb, 0); + + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + inode_inc_iversion(dir); + simple_unlink(dir, dentry); + + /* + * For now, VFS can't deal with case-insensitive negative dentries, so + * we invalidate them + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + return 0; } @@ -2055,29 +4005,79 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } +static int shmem_whiteout(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry) +{ + struct dentry *whiteout; + int error; + + whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); + if (!whiteout) + return -ENOMEM; + + error = shmem_mknod(idmap, old_dir, whiteout, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); + dput(whiteout); + if (error) + return error; + + /* + * Cheat and hash the whiteout while the old dentry is still in + * place, instead of playing games with FS_RENAME_DOES_D_MOVE. + * + * d_lookup() will consistently find one of them at this point, + * not sure which one, but that isn't even important. + */ + d_rehash(whiteout); + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. 
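[Illustrative aside, not part of the patch, before the rename implementation below: the RENAME_EXCHANGE and RENAME_WHITEOUT flags handled there are reached from userspace via renameat2(2). A minimal sketch, assuming glibc 2.28+ for the renameat2() wrapper (older systems can go through syscall(2)) and two existing entries on the same tmpfs mount:]

#define _GNU_SOURCE
#include <fcntl.h>	/* AT_FDCWD */
#include <linux/fs.h>	/* RENAME_EXCHANGE */
#include <stdio.h>	/* renameat2(), perror() */

int main(void)
{
	/* Atomically swap the two names; both must already exist. */
	if (renameat2(AT_FDCWD, "/dev/shm/a",
		      AT_FDCWD, "/dev/shm/b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}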
*/ -static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) +static int shmem_rename2(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return simple_offset_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; - if (new_dentry->d_inode) { + if (flags & RENAME_WHITEOUT) { + error = shmem_whiteout(idmap, old_dir, old_dentry); + if (error) + return error; + } + + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); + if (error) + return error; + + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); - if (they_are_dirs) + if (they_are_dirs) { + drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); + } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); @@ -2085,98 +4085,150 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - inode->i_ctime = CURRENT_TIME; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } -static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; - struct page *page; - char *kaddr; - struct shmem_inode_info *info; + struct folio *folio; + char *link; len = strlen(symname) + 1; - if (len > PAGE_CACHE_SIZE) + if (len > PAGE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); - if (!inode) - return -ENOSPC; + inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, + VM_NORESERVE); + if (IS_ERR(inode)) + return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; - } + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; - info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { - iput(inode); - return -ENOMEM; + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { + error = -ENOMEM; + goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { - error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); - if (error) { - iput(inode); - return error; - } + inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); + if (error) + goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len); - kunmap_atomic(kaddr); - SetPageUptodate(page); - set_page_dirty(page); - 
unlock_page(page); - page_cache_release(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = CURRENT_TIME; - d_instantiate(dentry, inode); - dget(dentry); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + inode_inc_iversion(dir); + d_make_persistent(dentry, inode); return 0; -} -static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); - return NULL; +out_remove_offset: + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); +out_iput: + iput(inode); + return error; } -static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +static void shmem_put_link(void *arg) { - struct page *page = NULL; - int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); - if (page) - unlock_page(page); - return page; + folio_mark_accessed(arg); + folio_put(arg); } -static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) { - if (!IS_ERR(nd_get_link(nd))) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); + struct folio *folio = NULL; + int error; + + if (!dentry) { + folio = filemap_get_folio(inode->i_mapping, 0); + if (IS_ERR(folio)) + return ERR_PTR(-ECHILD); + if (PageHWPoison(folio_page(folio, 0)) || + !folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-ECHILD); + } + } else { + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); + if (error) + return ERR_PTR(error); + if (!folio) + return ERR_PTR(-ECHILD); + if (PageHWPoison(folio_page(folio, 0))) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-ECHILD); + } + folio_unlock(folio); } + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct file_kattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + int ret, flags; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; + + flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + ret = shmem_set_inode_flags(inode, flags, dentry); + + if (ret) + return ret; + + info->fsflags = flags; + + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs @@ -2188,25 +4240,43 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co * Callback for security_inode_init_security() for acquiring xattrs. 
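[Illustrative aside, not part of the patch: the shmem_fileattr_get()/shmem_fileattr_set() handlers above back the classic inode-flag ioctls used by chattr(1)/lsattr(1). A minimal sketch, assuming FS_NOATIME_FL is among the flags SHMEM_FL_USER_MODIFIABLE permits and that the hypothetical file /dev/shm/demo is on tmpfs:]

#include <fcntl.h>
#include <linux/fs.h>	/* FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_NOATIME_FL */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/demo", O_RDWR | O_CREAT, 0600);
	int flags = 0;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		perror("FS_IOC_GETFLAGS");
	flags |= FS_NOATIME_FL;	/* handled by shmem_fileattr_set() */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
	close(fd);
	return 0;
}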
*/ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; + size_t ispace = 0; size_t len; + if (sbinfo->max_inodes) { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + ispace += simple_xattr_space(xattr->name, + xattr->value_len + XATTR_SECURITY_PREFIX_LEN); + } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) - return -ENOMEM; + break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { - kfree(new_xattr); - return -ENOMEM; + kvfree(new_xattr); + break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, @@ -2214,128 +4284,122 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_list_add(&info->xattrs, new_xattr); + simple_xattr_add(&info->xattrs, new_xattr); } - return 0; -} - -static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &generic_acl_access_handler, - &generic_acl_default_handler, -#endif - NULL -}; - -static int shmem_xattr_validate(const char *name) -{ - struct { const char *prefix; size_t len; } arr[] = { - { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, - { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } - }; - int i; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - size_t preflen = arr[i].len; - if (strncmp(name, arr[i].prefix, preflen) == 0) { - if (!name[preflen]) - return -EINVAL; - return 0; + if (xattr->name != NULL) { + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); } + simple_xattrs_free(&info->xattrs, NULL); + return -ENOMEM; } - return -EOPNOTSUPP; + + return 0; } -static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int shmem_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. 
- */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); - - err = shmem_xattr_validate(name); - if (err) - return err; + struct shmem_inode_info *info = SHMEM_I(inode); + name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } -static int shmem_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - err = shmem_xattr_validate(name); - if (err) - return err; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattr *old_xattr; + size_t ispace = 0; + + name = xattr_full_name(handler, name); + if (value && sbinfo->max_inodes) { + ispace = simple_xattr_space(name, size); + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } - return simple_xattr_set(&info->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + if (!IS_ERR(old_xattr)) { + ispace = 0; + if (old_xattr && sbinfo->max_inodes) + ispace = simple_xattr_space(old_xattr->name, + old_xattr->size); + simple_xattr_free(old_xattr); + old_xattr = NULL; + inode_set_ctime_current(inode); + inode_inc_iversion(inode); + } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + return PTR_ERR(old_xattr); } -static int shmem_removexattr(struct dentry *dentry, const char *name) -{ - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); - int err; +static const struct xattr_handler shmem_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. 
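[Illustrative aside, not part of the patch: with the user.* handler added to the table just below, ordinary extended attributes work on tmpfs through the usual syscalls. A minimal sketch using setxattr(2)/getxattr(2), assuming the hypothetical file /dev/shm/demo already exists on tmpfs:]

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/dev/shm/demo";
	char buf[64];
	ssize_t n;

	/* Stored via shmem_xattr_handler_set(), charged against free_ispace. */
	if (setxattr(path, "user.origin", "generated", strlen("generated"), 0) < 0)
		perror("setxattr");
	n = getxattr(path, "user.origin", buf, sizeof(buf));
	if (n < 0)
		perror("getxattr");
	else
		printf("user.origin = %.*s\n", (int)n, buf);
	return 0;
}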
- */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); +static const struct xattr_handler shmem_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - err = shmem_xattr_validate(name); - if (err) - return err; +static const struct xattr_handler shmem_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - return simple_xattr_remove(&info->xattrs, name); -} +static const struct xattr_handler * const shmem_xattr_handlers[] = { + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, + &shmem_user_xattr_handler, + NULL +}; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); - return simple_xattr_list(&info->xattrs, buffer, size); + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { - .readlink = generic_readlink, - .follow_link = shmem_follow_short_symlink, + .getattr = shmem_getattr, + .setattr = shmem_setattr, + .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = shmem_follow_link, - .put_link = shmem_put_link, + .getattr = shmem_getattr, + .setattr = shmem_setattr, + .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, #endif }; @@ -2352,6 +4416,14 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -2368,7 +4440,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } @@ -2411,164 +4483,427 @@ static const struct export_operations shmem_export_ops = { .fh_to_dentry = shmem_fh_to_dentry, }; -static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, - bool remount) +enum shmem_param { + Opt_gid, + Opt_huge, + Opt_mode, + Opt_mpol, + Opt_nr_blocks, + Opt_nr_inodes, + Opt_size, + Opt_uid, + Opt_inode32, + Opt_inode64, + Opt_noswap, + Opt_quota, + Opt_usrquota, + Opt_grpquota, + Opt_usrquota_block_hardlimit, + Opt_usrquota_inode_hardlimit, + Opt_grpquota_block_hardlimit, + Opt_grpquota_inode_hardlimit, + Opt_casefold_version, + Opt_casefold, + Opt_strict_encoding, +}; + +static const struct constant_table shmem_param_enums_huge[] = { + {"never", SHMEM_HUGE_NEVER }, + {"always", SHMEM_HUGE_ALWAYS }, + {"within_size", SHMEM_HUGE_WITHIN_SIZE }, + {"advise", SHMEM_HUGE_ADVISE }, + {} +}; + +const struct fs_parameter_spec 
shmem_fs_parameters[] = { + fsparam_gid ("gid", Opt_gid), + fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), + fsparam_u32oct("mode", Opt_mode), + fsparam_string("mpol", Opt_mpol), + fsparam_string("nr_blocks", Opt_nr_blocks), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("size", Opt_size), + fsparam_uid ("uid", Opt_uid), + fsparam_flag ("inode32", Opt_inode32), + fsparam_flag ("inode64", Opt_inode64), + fsparam_flag ("noswap", Opt_noswap), +#ifdef CONFIG_TMPFS_QUOTA + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), + fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), + fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), + fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), +#endif + fsparam_string("casefold", Opt_casefold_version), + fsparam_flag ("casefold", Opt_casefold), + fsparam_flag ("strict_encoding", Opt_strict_encoding), + {} +}; + +#if IS_ENABLED(CONFIG_UNICODE) +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) { - char *this_char, *value, *rest; - struct mempolicy *mpol = NULL; - uid_t uid; - gid_t gid; + struct shmem_options *ctx = fc->fs_private; + int version = UTF8_LATEST; + struct unicode_map *encoding; + char *version_str = param->string + 5; + + if (!latest_version) { + if (strncmp(param->string, "utf8-", 5)) + return invalfc(fc, "Only UTF-8 encodings are supported " + "in the format: utf8-<version number>"); + + version = utf8_parse_version(version_str); + if (version < 0) + return invalfc(fc, "Invalid UTF-8 version: %s", version_str); + } - while (options != NULL) { - this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. 
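[Illustrative aside, not part of the patch: the comma-splitting quirk described in the comment here (and preserved by shmem_next_opt() further below) matters because the whole option string arrives through mount(2) as one comma-separated blob, yet an mpol nodelist may itself contain commas. A minimal sketch, assuming a hypothetical mount point /mnt/t and, for the mpol option, a NUMA kernel where nodes 0 and 2 exist:]

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* In "mpol=interleave:0,2,size=64m" the comma after '0' is followed
	 * by a digit, so it stays inside the nodelist; the comma after '2'
	 * starts the next option. */
	if (mount("tmpfs", "/mnt/t", "tmpfs", 0,
		  "mpol=interleave:0,2,size=64m,huge=within_size") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}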
- */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } + encoding = utf8_load(version); + + if (IS_ERR(encoding)) { + return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), + unicode_rev(version)); + } + + pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), unicode_rev(version)); + + ctx->encoding = encoding; + + return 0; +} +#else +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +} +#endif + +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) +{ + struct shmem_options *ctx = fc->fs_private; + struct fs_parse_result result; + unsigned long long size; + char *rest; + int opt; + kuid_t kuid; + kgid_t kgid; + + opt = fs_parse(fc, shmem_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_size: + size = memparse(param->string, &rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; + size *= totalram_pages(); + do_div(size, 100); + rest++; } - if (!*this_char) - continue; - if ((value = strchr(this_char,'=')) != NULL) { - *value++ = 0; - } else { - printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", - this_char); - goto error; - } - - if (!strcmp(this_char,"size")) { - unsigned long long size; - size = memparse(value,&rest); - if (*rest == '%') { - size <<= PAGE_SHIFT; - size *= totalram_pages; - do_div(size, 100); - rest++; - } - if (*rest) - goto bad_val; - sbinfo->max_blocks = - DIV_ROUND_UP(size, PAGE_CACHE_SIZE); - } else if (!strcmp(this_char,"nr_blocks")) { - sbinfo->max_blocks = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"nr_inodes")) { - sbinfo->max_inodes = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"mode")) { - if (remount) - continue; - sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (remount) - continue; - uid = simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->uid = make_kuid(current_user_ns(), uid); - if (!uid_valid(sbinfo->uid)) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (remount) - continue; - gid = simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->gid = make_kgid(current_user_ns(), gid); - if (!gid_valid(sbinfo->gid)) - goto bad_val; - } else if (!strcmp(this_char,"mpol")) { - mpol_put(mpol); - mpol = NULL; - if (mpol_parse_str(value, &mpol)) - goto bad_val; - } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", - this_char); - goto error; + if (*rest) + goto bad_value; + ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_blocks: + ctx->blocks = memparse(param->string, &rest); + if (*rest || ctx->blocks > LONG_MAX) + goto bad_value; + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_inodes: + ctx->inodes = memparse(param->string, &rest); + if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) + goto bad_value; + ctx->seen |= SHMEM_SEEN_INODES; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 07777; + break; + case Opt_uid: + kuid = result.uid; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; + break; + case Opt_gid: + kgid = result.gid; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; + break; + case Opt_huge: + ctx->huge = result.uint_32; + if (ctx->huge != SHMEM_HUGE_NEVER && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage())) + goto unsupported_parameter; + ctx->seen |= SHMEM_SEEN_HUGE; + break; + case Opt_mpol: + if (IS_ENABLED(CONFIG_NUMA)) { + mpol_put(ctx->mpol); + ctx->mpol = NULL; + if (mpol_parse_str(param->string, &ctx->mpol)) + goto bad_value; + break; } + goto unsupported_parameter; + case Opt_inode32: + ctx->full_inums = false; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + case Opt_inode64: + if (sizeof(ino_t) < 8) { + return invalfc(fc, + "Cannot use inode64 with <64bit inums in kernel\n"); + } + ctx->full_inums = true; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + case Opt_noswap: + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { + return invalfc(fc, + "Turning off swap in unprivileged tmpfs mounts unsupported"); + } + ctx->noswap = true; + break; + case Opt_quota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); + break; + case Opt_usrquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_USR; + break; + case Opt_grpquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_GRP; + break; + case Opt_usrquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "User quota block hardlimit too large."); + ctx->qlimits.usrquota_bhardlimit = size; + break; + case Opt_grpquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "Group quota block hardlimit too large."); + ctx->qlimits.grpquota_bhardlimit = size; + break; + case Opt_usrquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "User quota inode hardlimit too large."); + ctx->qlimits.usrquota_ihardlimit = size; + break; + case Opt_grpquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "Group quota inode hardlimit too large."); + ctx->qlimits.grpquota_ihardlimit = size; + break; + case Opt_casefold_version: + return shmem_parse_opt_casefold(fc, param, false); + case Opt_casefold: + return shmem_parse_opt_casefold(fc, param, true); + case Opt_strict_encoding: +#if IS_ENABLED(CONFIG_UNICODE) + ctx->strict_encoding = true; + break; +#else + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +#endif } - sbinfo->mpol = mpol; return 0; -bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", - value, this_char); -error: - mpol_put(mpol); - return 1; +unsupported_parameter: + return 
invalfc(fc, "Unsupported parameter '%s'", param->key); +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); +} + +static char *shmem_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; + } + } + *s = NULL; + return sbegin; } -static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +static int shmem_parse_monolithic(struct fs_context *fc, void *data) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - struct shmem_sb_info config = *sbinfo; - unsigned long inodes; - int error = -EINVAL; + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); +} - config.mpol = NULL; - if (shmem_parse_options(data, &config, true)) - return error; +/* + * Reconfigure a shmem filesystem. + */ +static int shmem_reconfigure(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); + unsigned long used_isp; + struct mempolicy *mpol = NULL; + const char *err; - spin_lock(&sbinfo->stat_lock); - inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) - goto out; - if (config.max_inodes < inodes) + raw_spin_lock(&sbinfo->stat_lock); + used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; + + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { + err = "Cannot retroactively limit size"; + goto out; + } + if (percpu_counter_compare(&sbinfo->used_blocks, + ctx->blocks) > 0) { + err = "Too small a size for current use"; + goto out; + } + } + if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { + if (!sbinfo->max_inodes) { + err = "Cannot retroactively limit inodes"; + goto out; + } + if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { + err = "Too few inodes for current use"; + goto out; + } + } + + if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && + sbinfo->next_ino > UINT_MAX) { + err = "Current inum too high to switch to 32-bit inums"; goto out; + } + /* - * Those tests disallow limited->unlimited while any are in use; - * but we must separately disallow unlimited->limited, because - * in that case we have no record of how much is already in use. + * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" + * counterpart for (re-)enabling swap. 
*/ - if (config.max_blocks && !sbinfo->max_blocks) + if (ctx->noswap && !sbinfo->noswap) { + err = "Cannot disable swap on remount"; goto out; - if (config.max_inodes && !sbinfo->max_inodes) + } + + if (ctx->seen & SHMEM_SEEN_QUOTA && + !sb_any_quota_loaded(fc->root->d_sb)) { + err = "Cannot enable quota on remount"; goto out; + } - error = 0; - sbinfo->max_blocks = config.max_blocks; - sbinfo->max_inodes = config.max_inodes; - sbinfo->free_inodes = config.max_inodes - inodes; +#ifdef CONFIG_TMPFS_QUOTA +#define CHANGED_LIMIT(name) \ + (ctx->qlimits.name## hardlimit && \ + (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) + + if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || + CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { + err = "Cannot change global quota limit on remount"; + goto out; + } +#endif /* CONFIG_TMPFS_QUOTA */ + + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + if (ctx->seen & SHMEM_SEEN_INUMS) + sbinfo->full_inums = ctx->full_inums; + if (ctx->seen & SHMEM_SEEN_BLOCKS) + sbinfo->max_blocks = ctx->blocks; + if (ctx->seen & SHMEM_SEEN_INODES) { + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; + } /* * Preserve previous mempolicy unless mpol remount option was specified. */ - if (config.mpol) { - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + if (ctx->mpol) { + mpol = sbinfo->mpol; + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ + ctx->mpol = NULL; } + + if (ctx->noswap) + sbinfo->noswap = true; + + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); + return 0; out: - spin_unlock(&sbinfo->stat_lock); - return error; + raw_spin_unlock(&sbinfo->stat_lock); + return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); + struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) - seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); + seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); - if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) + if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", @@ -2576,36 +4911,101 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); - shmem_show_mpol(seq, sbinfo->mpol); + + /* + * Showing inode{64,32} might be useful even if it's the system default, + * since then people don't have to resort to checking both here and + * /proc/config.gz to confirm 64-bit inums were successfully applied + * (which may not even exist if IKCONFIG_PROC isn't enabled). + * + * We hide it when inode64 isn't the default and we are using 32-bit + * inodes, since that probably just means the feature isn't even under + * consideration. 
+ * + * As such: + * + * +-----------------+-----------------+ + * | TMPFS_INODE64=y | TMPFS_INODE64=n | + * +------------------+-----------------+-----------------+ + * | full_inums=true | show | show | + * | full_inums=false | show | hide | + * +------------------+-----------------+-----------------+ + * + */ + if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) + seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ + if (sbinfo->huge) + seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); +#endif + mpol = shmem_get_sbmpol(sbinfo); + shmem_show_mpol(seq, mpol); + mpol_put(mpol); + if (sbinfo->noswap) + seq_printf(seq, ",noswap"); +#ifdef CONFIG_TMPFS_QUOTA + if (sb_has_quota_active(root->d_sb, USRQUOTA)) + seq_printf(seq, ",usrquota"); + if (sb_has_quota_active(root->d_sb, GRPQUOTA)) + seq_printf(seq, ",grpquota"); + if (sbinfo->qlimits.usrquota_bhardlimit) + seq_printf(seq, ",usrquota_block_hardlimit=%lld", + sbinfo->qlimits.usrquota_bhardlimit); + if (sbinfo->qlimits.grpquota_bhardlimit) + seq_printf(seq, ",grpquota_block_hardlimit=%lld", + sbinfo->qlimits.grpquota_bhardlimit); + if (sbinfo->qlimits.usrquota_ihardlimit) + seq_printf(seq, ",usrquota_inode_hardlimit=%lld", + sbinfo->qlimits.usrquota_ihardlimit); + if (sbinfo->qlimits.grpquota_ihardlimit) + seq_printf(seq, ",grpquota_inode_hardlimit=%lld", + sbinfo->qlimits.grpquota_ihardlimit); +#endif return 0; } + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) + utf8_unload(sb->s_encoding); +#endif + +#ifdef CONFIG_TMPFS_QUOTA + shmem_disable_quotas(sb); +#endif + free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } -int shmem_fill_super(struct super_block *sb, void *data, int silent) +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) +static const struct dentry_operations shmem_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + +static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { + struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; - int err = -ENOMEM; + int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) - return -ENOMEM; + return error; - sbinfo->mode = S_IRWXUGO | S_ISVTX; - sbinfo->uid = current_fsuid(); - sbinfo->gid = current_fsgid(); sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -2614,28 +5014,69 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. 
*/ - if (!(sb->s_flags & MS_NOUSER)) { - sbinfo->max_blocks = shmem_default_max_blocks(); - sbinfo->max_inodes = shmem_default_max_inodes(); - if (shmem_parse_options(data, sbinfo, false)) { - err = -EINVAL; - goto failed; - } + if (!(sb->s_flags & SB_KERNMOUNT)) { + if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) + ctx->blocks = shmem_default_max_blocks(); + if (!(ctx->seen & SHMEM_SEEN_INODES)) + ctx->inodes = shmem_default_max_inodes(); + if (!(ctx->seen & SHMEM_SEEN_INUMS)) + ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); + sbinfo->noswap = ctx->noswap; + } else { + sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; + +#if IS_ENABLED(CONFIG_UNICODE) + if (!ctx->encoding && ctx->strict_encoding) { + pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); + error = -EINVAL; + goto failed; + } + + if (ctx->encoding) { + sb->s_encoding = ctx->encoding; + set_default_d_op(sb, &shmem_ci_dentry_ops); + if (ctx->strict_encoding) + sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; + } +#endif + #else - sb->s_flags |= MS_NOUSER; + sb->s_flags |= SB_NOUSER; +#endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; + sbinfo->max_blocks = ctx->blocks; + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; + if (sb->s_flags & SB_KERNMOUNT) { + sbinfo->ino_batch = alloc_percpu(ino_t); + if (!sbinfo->ino_batch) + goto failed; + } + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->full_inums = ctx->full_inums; + sbinfo->mode = ctx->mode; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; #endif + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + raw_spin_lock_init(&sbinfo->stat_lock); + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; - sbinfo->free_inodes = sbinfo->max_inodes; + spin_lock_init(&sbinfo->shrinklist_lock); + INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; @@ -2643,12 +5084,33 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif + uuid_t uuid; + uuid_gen(&uuid); + super_set_uuid(sb, uuid.b, sizeof(uuid)); + +#ifdef CONFIG_TMPFS_QUOTA + if (ctx->seen & SHMEM_SEEN_QUOTA) { + sb->dq_op = &shmem_quota_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; - inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); - if (!inode) + /* Copy the default limits from ctx into sbinfo */ + memcpy(&sbinfo->qlimits, &ctx->qlimits, + sizeof(struct shmem_quota_limits)); + + if (shmem_enable_quotas(sb, ctx->quota_types)) + goto failed; + } +#endif /* CONFIG_TMPFS_QUOTA */ + + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); goto failed; + } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); @@ -2658,23 +5120,49 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) 
failed: shmem_put_super(sb); - return err; + return error; +} + +static int shmem_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, shmem_fill_super); } -static struct kmem_cache *shmem_inode_cachep; +static void shmem_free_fc(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + + if (ctx) { + mpol_put(ctx->mpol); + kfree(ctx); + } +} + +static const struct fs_context_operations shmem_fs_context_ops = { + .free = shmem_free_fc, + .get_tree = shmem_get_tree, +#ifdef CONFIG_TMPFS + .parse_monolithic = shmem_parse_monolithic, + .parse_param = shmem_parse_one, + .reconfigure = shmem_reconfigure, +#endif +}; + +static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; - info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } -static void shmem_destroy_callback(struct rcu_head *head) +static void shmem_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2682,7 +5170,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - call_rcu(&inode->i_rcu, shmem_destroy_callback); + if (S_ISDIR(inode->i_mode)) + simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) @@ -2691,57 +5180,66 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static int shmem_init_inodecache(void) +static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); - return 0; + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } -static void shmem_destroy_inodecache(void) +static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_folio(struct address_space *mapping, + struct folio *folio) +{ + return 0; +} + static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif - .migratepage = migrate_page, - .error_remove_page = generic_error_remove_page, +#ifdef CONFIG_MIGRATION + .migrate_folio = migrate_folio, +#endif + .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { - .mmap = shmem_mmap, + .mmap_prepare = shmem_mmap_prepare, + .open = shmem_file_open, + .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = shmem_file_aio_read, - .aio_write = generic_file_aio_write, + .read_iter = shmem_file_read_iter, + .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, #endif }; static const struct inode_operations shmem_inode_operations = { + .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR - .setxattr 
= shmem_setxattr, - .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS + .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, @@ -2750,106 +5248,493 @@ static const struct inode_operations shmem_dir_inode_operations = { .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, - .rename = shmem_rename, + .rename = shmem_rename2, .tmpfile = shmem_tmpfile, + .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { + .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, + .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, + .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, - .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif +#ifdef CONFIG_TMPFS_QUOTA + .get_dquots = shmem_get_dquots, +#endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + .nr_cached_objects = shmem_unused_huge_count, + .free_cached_objects = shmem_unused_huge_scan, +#endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; -static struct dentry *shmem_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int shmem_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, shmem_fill_super); + struct shmem_options *ctx; + + ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->mode = 0777 | S_ISVTX; + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + +#if IS_ENABLED(CONFIG_UNICODE) + ctx->encoding = NULL; +#endif + + fc->fs_private = ctx; + fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif + return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .mount = shmem_mount, - .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .init_fs_context = shmem_init_fs_context, +#ifdef CONFIG_TMPFS + .parameters = shmem_fs_parameters, +#endif + .kill_sb = kill_anon_super, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, +}; + +#if defined(CONFIG_SYSFS) && 
defined(CONFIG_TMPFS) + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define TMPFS_ATTR_W(_name, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define TMPFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define TMPFS_ATTR_RO(_name, _show) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#if IS_ENABLED(CONFIG_UNICODE) +static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, + char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} +TMPFS_ATTR_RO(casefold, casefold_show); +#endif + +static struct attribute *tmpfs_attributes[] = { +#if IS_ENABLED(CONFIG_UNICODE) + &tmpfs_attr_casefold.attr, +#endif + NULL }; -int __init shmem_init(void) +static const struct attribute_group tmpfs_attribute_group = { + .attrs = tmpfs_attributes, + .name = "features" +}; + +static struct kobject *tmpfs_kobj; + +static int __init tmpfs_sysfs_init(void) +{ + int ret; + + tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); + if (!tmpfs_kobj) + return -ENOMEM; + + ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); + if (ret) + kobject_put(tmpfs_kobj); + + return ret; +} +#endif /* CONFIG_SYSFS && CONFIG_TMPFS */ + +void __init shmem_init(void) { int error; - error = bdi_init(&shmem_backing_dev_info); - if (error) - goto out4; + shmem_init_inodecache(); - error = shmem_init_inodecache(); - if (error) - goto out3; +#ifdef CONFIG_TMPFS_QUOTA + register_quota_format(&shmem_quota_format); +#endif error = register_filesystem(&shmem_fs_type); if (error) { - printk(KERN_ERR "Could not register tmpfs\n"); + pr_err("Could not register tmpfs\n"); goto out2; } - shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, - shmem_fs_type.name, NULL); + shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); - printk(KERN_ERR "Could not kern_mount tmpfs\n"); + pr_err("Could not kern_mount tmpfs\n"); goto out1; } - return 0; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + error = tmpfs_sysfs_init(); + if (error) { + pr_err("Could not init tmpfs sysfs\n"); + goto out1; + } +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + else + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. + */ + if (!shmem_orders_configured) + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); +#endif + return; out1: unregister_filesystem(&shmem_fs_type); out2: +#ifdef CONFIG_TMPFS_QUOTA + unregister_quota_format(&shmem_quota_format); +#endif shmem_destroy_inodecache(); -out3: - bdi_destroy(&shmem_backing_dev_info); -out4: shm_mnt = ERR_PTR(error); - return error; } +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +static ssize_t shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + static const int values[] = { + SHMEM_HUGE_ALWAYS, + SHMEM_HUGE_WITHIN_SIZE, + SHMEM_HUGE_ADVISE, + SHMEM_HUGE_NEVER, + SHMEM_HUGE_DENY, + SHMEM_HUGE_FORCE, + }; + int len = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(values); i++) { + len += sysfs_emit_at(buf, len, + shmem_huge == values[i] ? 
"%s[%s]" : "%s%s", + i ? " " : "", shmem_format_huge(values[i])); + } + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} + +static ssize_t shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + char tmp[16]; + int huge, err; + + if (count + 1 > sizeof(tmp)) + return -EINVAL; + memcpy(tmp, buf, count); + tmp[count] = '\0'; + if (count && tmp[count - 1] == '\n') + tmp[count - 1] = '\0'; + + huge = shmem_parse_huge(tmp); + if (huge == -EINVAL) + return huge; + + shmem_huge = huge; + if (shmem_huge > SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + + err = start_stop_khugepaged(); + return err ? err : count; +} + +struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "advise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + if (ret > 0) { + int err = 
start_stop_khugepaged(); + + if (err) + ret = err; + } + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) + +static int __init setup_transparent_hugepage_shmem(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge == -EINVAL) { + pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); + return huge; + } + + shmem_huge = huge; + return 1; +} +__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); + +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_shmem(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise, within_size; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strscpy(str_dup, str); + + always = huge_shmem_orders_always; + inherit = huge_shmem_orders_inherit; + madvise = huge_shmem_orders_madvise; + within_size = huge_shmem_orders_within_size; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "advise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "within_size")) { + bitmap_set(&within_size, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else { + pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); + goto err; + } + } + } + + huge_shmem_orders_always = always; + huge_shmem_orders_madvise = madvise; + huge_shmem_orders_inherit = inherit; + huge_shmem_orders_within_size = within_size; + 
shmem_orders_configured = true;
+	return 1;
+
+err:
+	pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
+	return 0;
+}
+__setup("thp_shmem=", setup_thp_shmem);
+
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 #else /* !CONFIG_SHMEM */
 
 /*
@@ -2863,27 +5748,26 @@ out4:
 
 static struct file_system_type shmem_fs_type = {
 	.name		= "tmpfs",
-	.mount		= ramfs_mount,
-	.kill_sb	= kill_litter_super,
+	.init_fs_context = ramfs_init_fs_context,
+	.parameters	= ramfs_fs_parameters,
+	.kill_sb	= ramfs_kill_sb,
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 
-int __init shmem_init(void)
+void __init shmem_init(void)
 {
 	BUG_ON(register_filesystem(&shmem_fs_type) != 0);
 
 	shm_mnt = kern_mount(&shmem_fs_type);
 	BUG_ON(IS_ERR(shm_mnt));
-
-	return 0;
 }
 
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type)
 {
 	return 0;
 }
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
 	return 0;
 }
@@ -2892,148 +5776,225 @@ void shmem_unlock_mapping(struct address_space *mapping)
 {
 }
 
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+#ifdef CONFIG_MMU
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr, unsigned long len,
+				      unsigned long pgoff, unsigned long flags)
+{
+	return mm_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+#endif
+
+void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	truncate_inode_pages_range(inode->i_mapping, lstart, lend);
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
 #define shmem_vm_ops				generic_file_vm_ops
+#define shmem_anon_vm_ops			generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
 #define shmem_acct_size(flags, size)		0
 #define shmem_unacct_size(flags, size)		do {} while (0)
 
-#endif /* CONFIG_SHMEM */
-
-/* common code */
-
-static char *shmem_dname(struct dentry *dentry, char *buffer, int buflen)
+static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
+				struct super_block *sb, struct inode *dir,
+				umode_t mode, dev_t dev, unsigned long flags)
 {
-	return dynamic_dname(dentry, buffer, buflen, "/%s (deleted)",
-				dentry->d_name.name);
+	struct inode *inode = ramfs_get_inode(sb, dir, mode, dev);
+	return inode ? inode : ERR_PTR(-ENOSPC);
 }
 
-static struct dentry_operations anon_ops = {
-	.d_dname = shmem_dname
-};
+#endif /* CONFIG_SHMEM */
 
-/**
- * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
- */
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+/* common code */
+
+static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
+				       loff_t size, unsigned long vm_flags,
+				       unsigned int i_flags)
 {
-	struct file *res;
+	unsigned long flags = (vm_flags & VM_NORESERVE) ? 
SHMEM_F_NORESERVE : 0; struct inode *inode; - struct path path; - struct super_block *sb; - struct qstr this; + struct file *res; - if (IS_ERR(shm_mnt)) - return ERR_CAST(shm_mnt); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); + if (is_idmapped_mnt(mnt)) + return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) return ERR_PTR(-ENOMEM); - res = ERR_PTR(-ENOMEM); - this.name = name; - this.len = strlen(name); - this.hash = 0; /* will go */ - sb = shm_mnt->mnt_sb; - path.dentry = d_alloc_pseudo(sb, &this); - if (!path.dentry) - goto put_memory; - d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); - - res = ERR_PTR(-ENOSPC); - inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (!inode) - goto put_dentry; - - d_instantiate(path.dentry, inode); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, + S_IFREG | S_IRWXUGO, 0, vm_flags); + if (IS_ERR(inode)) { + shmem_unacct_size(flags, size); + return ERR_CAST(inode); + } + inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (!IS_ERR(res)) + res = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; - - res = alloc_file(&path, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - if (IS_ERR(res)) - goto put_dentry; - + iput(inode); return res; +} -put_dentry: - path_put(&path); -put_memory: - shmem_unacct_size(flags, size); - return res; +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The users are the big_key and shm implementations. LSM + * checks are provided at the key or shm level rather than the inode. 
+ * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); +} +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** + * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs + * @mnt: the tmpfs mount where the file will be created + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags) +{ + return __shmem_file_setup(mnt, name, size, flags, 0); +} +EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); + +static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags) +{ + loff_t size = end - start; + + /* + * Cloning a new file under mmap_lock leads to a lock ordering conflict + * between XFS directory reading and selinux: since this file is only + * accessible to the user through its mapping, use S_PRIVATE flag to + * bypass file security, in the same way as shmem_kernel_file_setup(). + */ + return shmem_kernel_file_setup("dev/zero", size, vm_flags); +} + +/** * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + * @vma: the vma to be mmapped is prepared by do_mmap + * Returns: 0 on success, or error */ int shmem_zero_setup(struct vm_area_struct *vma) { - struct file *file; - loff_t size = vma->vm_end - vma->vm_start; + struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags); - file = shmem_file_setup("dev/zero", size, vma->vm_flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; + return 0; } /** - * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. - * @mapping: the page's address_space - * @index: the page index + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA + * descriptor for convenience. + * @desc: Describes VMA + * Returns: 0 on success, or error + */ +int shmem_zero_setup_desc(struct vm_area_desc *desc) +{ + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags); + + if (IS_ERR(file)) + return PTR_ERR(file); + + desc->vm_file = file; + desc->vm_ops = &shmem_anon_vm_ops; + + return 0; +} + +/** + * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. + * @mapping: the folio's address_space + * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. 
- * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * But read_cache_page_gfp() uses the ->read_folio() method: which does not
  * suit tmpfs, since it may have pages in swapcache, and needs to find those
  * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
  *
  * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
  * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
  */
-struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
-					 pgoff_t index, gfp_t gfp)
+struct folio *shmem_read_folio_gfp(struct address_space *mapping,
+				   pgoff_t index, gfp_t gfp)
 {
 #ifdef CONFIG_SHMEM
 	struct inode *inode = mapping->host;
-	struct page *page;
+	struct folio *folio;
 	int error;
 
-	BUG_ON(mapping->a_ops != &shmem_aops);
-	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+	error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+				    &folio, SGP_CACHE, gfp, NULL, NULL);
 	if (error)
-		page = ERR_PTR(error);
-	else
-		unlock_page(page);
-	return page;
+		return ERR_PTR(error);
+
+	folio_unlock(folio);
+	return folio;
 #else
 	/*
 	 * The tiny !SHMEM case uses ramfs without swap
 	 */
-	return read_cache_page_gfp(mapping, index, gfp);
+	return mapping_read_folio_gfp(mapping, index, gfp);
 #endif
 }
+EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);
+
+struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
+					 pgoff_t index, gfp_t gfp)
+{
+	struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
+	struct page *page;
+
+	if (IS_ERR(folio))
+		return &folio->page;
+
+	page = folio_file_page(folio, index);
+	if (PageHWPoison(page)) {
+		folio_put(folio);
+		return ERR_PTR(-EIO);
+	}
+
+	return page;
+}
 EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
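
The shmem_file_setup() and shmem_kernel_file_setup() interfaces that close out this patch are what other kernel code calls to obtain an unlinked, swappable tmpfs file as backing store (drm_gem and the big_key/shm users named in the kernel-doc above). A minimal in-kernel sketch of that pattern, modelled on drm_gem_object_init(); the my_obj_alloc_backing() name is illustrative only, not part of this patch:

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct file *my_obj_alloc_backing(loff_t size)
{
	struct file *filp;

	/* VM_NORESERVE: do not pre-account the whole object size */
	filp = shmem_file_setup("my-obj", size, VM_NORESERVE);
	if (IS_ERR(filp))
		return filp;

	/* allow reclaim to push these pages out to swap instead of pinning them */
	mapping_set_gfp_mask(filp->f_mapping,
			     GFP_HIGHUSER | __GFP_RECLAIMABLE);
	return filp;
}

The file is born unlinked, so the final fput() releases the struct file and, with it, the inode and all of its pages.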
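
A companion sketch for the read side that the ->read_folio() comment above describes: a driver populating its page array from the shmem mapping via shmem_read_mapping_page_gfp(), mixing in __GFP_NORETRY | __GFP_NOWARN the way the i915 example in that comment does. my_obj_get_pages() and the caller-supplied pages[] array are assumptions for illustration:

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static int my_obj_get_pages(struct file *filp, struct page **pages, pgoff_t nr)
{
	struct address_space *mapping = filp->f_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	pgoff_t i;

	for (i = 0; i < nr; i++) {
		struct page *page;

		/* may find the page in the page/swap cache or swap it back in */
		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
		if (IS_ERR(page)) {
			while (i--)
				put_page(pages[i]);
			return PTR_ERR(page);
		}
		/* each page comes back with a reference held */
		pages[i] = page;
	}
	return 0;
}

Callers drop the references with put_page() once the object is torn down.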
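
Finally, the fs_context plumbing earlier in this patch (shmem_parse_one(), shmem_get_tree() and shmem_reconfigure()) is what the new mount API drives from userspace. A rough sketch of exercising it, assuming a libc that exposes the fsopen()/fsconfig()/fsmount()/move_mount() wrappers (glibc 2.36 or later) and a hypothetical pre-existing mount point /mnt/t; note that the noswap option, as the parser enforces, needs CAP_SYS_ADMIN in the initial user namespace:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	int fsfd, mntfd;

	fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}

	/* each call reaches shmem_parse_one() as one fs_parameter */
	fsconfig(fsfd, FSCONFIG_SET_STRING, "size", "50%", 0);
	fsconfig(fsfd, FSCONFIG_SET_STRING, "huge", "within_size", 0);
	fsconfig(fsfd, FSCONFIG_SET_FLAG, "noswap", NULL, 0);

	/* create the superblock: shmem_get_tree() -> shmem_fill_super() */
	if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("FSCONFIG_CMD_CREATE");
		return 1;
	}

	mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd < 0) {
		perror("fsmount");
		return 1;
	}

	/*
	 * Attach the mount. Reconfiguring a live tmpfs would instead go
	 * through fspick() + FSCONFIG_CMD_RECONFIGURE, i.e. shmem_reconfigure().
	 */
	if (move_mount(mntfd, "", AT_FDCWD, "/mnt/t", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		return 1;
	}
	return 0;
}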
