diff options
Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- | mm/shmem.c | 182 |
1 files changed, 108 insertions, 74 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..7fdd707ac1ac 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -107,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -292,7 +292,7 @@ bool vma_is_shmem(struct vm_area_struct *vma) } static LIST_HEAD(shmem_swaplist); -static DEFINE_MUTEX(shmem_swaplist_mutex); +static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA @@ -432,10 +432,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * Return: true if swapped was incremented from 0, for shmem_writeout(). */ -static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) +static bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); + bool first_swapped = false; long freed; spin_lock(&info->lock); @@ -446,12 +449,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ - if (swapped > 0) + if (swapped > 0) { + if (info->swapped == swapped) + first_swapped = true; freed += swapped; + } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); @@ -459,6 +465,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); + return first_swapped; } bool shmem_charge(struct inode *inode, long pages) @@ -615,7 +622,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); @@ -862,7 +869,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { return 0; } @@ -1375,11 +1382,11 @@ static void shmem_evict_inode(struct inode *inode) /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); } } @@ -1446,8 +1453,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { @@ -1504,7 +1509,8 @@ int shmem_unuse(unsigned int type) if (list_empty(&shmem_swaplist)) return 0; - mutex_lock(&shmem_swaplist_mutex); + spin_lock(&shmem_swaplist_lock); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1517,31 +1523,38 @@ int shmem_unuse(unsigned int type) * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); - mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); + spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } - mutex_unlock(&shmem_swaplist_mutex); + spin_unlock(&shmem_swaplist_lock); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @plug: swap plug + * @folio_list: list to put back folios on split + * + * Move the folio from the page cache to the swap cache. */ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct swap_iocb **plug, + struct list_head *folio_list) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1550,16 +1563,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) int nr_pages; bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ - if (WARN_ON_ONCE(!wbc->for_reclaim)) - goto redirty; - if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; @@ -1586,9 +1589,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, folio_list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1627,39 +1629,66 @@ try_split: folio_mark_uptodate(folio); } - /* - * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the folio is - * moved to swap cache, when its pagelock no longer protects - * the inode from eviction. But don't unlock the mutex until - * we've incremented swapped, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under this mutex. - */ - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add(&info->swaplist, &shmem_swaplist); - if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { - shmem_recalc_inode(inode, 0, nr_pages); + bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); + int error; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * removed from page cache, when its pagelock no longer + * protects the inode from eviction. And do it now, after + * we've incremented swapped, because shmem_unuse() will + * prune a !swapped inode from the swaplist. + */ + if (first_swapped) { + spin_lock(&shmem_swaplist_lock); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + spin_unlock(&shmem_swaplist_lock); + } + swap_shmem_alloc(folio->swap, nr_pages); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); - mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); - } + error = swap_writeout(folio, plug); + if (error != AOP_WRITEPAGE_ACTIVATE) { + /* folio has been unlocked */ + return error; + } + + /* + * The intention here is to avoid holding on to the swap when + * zswap was unable to compress and unable to writeback; but + * it will be appropriate if other reactivate cases are added. + */ + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(folio->swap), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + /* Swap entry might be erased by racing shmem_free_swap() */ + if (!error) { + shmem_recalc_inode(inode, 0, -nr_pages); + swap_free_nr(folio->swap, nr_pages); + } - list_del_init(&info->swaplist); - mutex_unlock(&shmem_swaplist_mutex); + /* + * The delete_from_swap_cache() below could be left for + * shrink_folio_list()'s folio_free_swap() to dispose of; + * but I'm a little nervous about letting this folio out of + * shmem_writeout() in a hybrid half-tmpfs-half-swap state + * e.g. folio_mapping(folio) might give an unexpected answer. + */ + delete_from_swap_cache(folio); + goto redirty; + } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); - if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ - folio_unlock(folio); - return 0; + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1760,7 +1789,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma ? vma->vm_flags : 0; + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) @@ -2262,6 +2291,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2275,9 +2305,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. */ @@ -2335,6 +2368,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, */ split_order = shmem_split_large_entry(inode, index, swap, gfp); if (split_order < 0) { + folio_put(folio); + folio = NULL; error = split_order; goto failed; } @@ -3267,9 +3302,9 @@ static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int -shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) +shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -3301,9 +3336,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } static int -shmem_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) +shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; @@ -3768,7 +3803,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) @@ -4184,7 +4219,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, #ifdef CONFIG_TMPFS_XATTR -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); @@ -4194,7 +4229,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) } static int shmem_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); @@ -4981,7 +5016,6 @@ static void shmem_put_super(struct super_block *sb) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, - .d_delete = always_delete_dentry, }; #endif @@ -5029,7 +5063,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) if (ctx->encoding) { sb->s_encoding = ctx->encoding; - sb->s_d_op = &shmem_ci_dentry_ops; + set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } @@ -5038,6 +5072,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ + sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; @@ -5191,7 +5226,6 @@ static int shmem_error_remove_folio(struct address_space *mapping, } static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -5810,12 +5844,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { |