Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- | mm/shmem.c | 1775
1 file changed, 1370 insertions, 405 deletions
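Throughout the patch below, sets of candidate folio allocation orders are carried as bitmasks (BIT(order) per candidate) and walked from largest to smallest with highest_order()/next_order(), falling back one order at a time when an allocation or a page-cache conflict check fails. The following is a minimal userspace model of that walk, not kernel code: __builtin_clzl stands in for the kernel's fls_long(), and the mask values are arbitrary examples.

#include <stdio.h>

#define BIT(nr) (1UL << (nr))

/* Index of the highest set bit; models the kernel's highest_order(). */
static int highest_order(unsigned long orders)
{
	return (int)(8 * sizeof(long)) - 1 - __builtin_clzl(orders);
}

/* Retire the order just tried and return the next lower candidate. */
static int next_order(unsigned long *orders, int prev)
{
	*orders &= ~BIT(prev);
	return *orders ? highest_order(*orders) : 0;
}

int main(void)
{
	/* Made-up mask: candidate orders 9, 4 and 2 (e.g. PMD plus two mTHP sizes) */
	unsigned long orders = BIT(9) | BIT(4) | BIT(2);
	int order = highest_order(orders);

	while (orders) {
		printf("try order %d (%lu pages)\n", order, 1UL << order);
		/* a real caller would stop here once an allocation succeeds */
		order = next_order(&orders, order);
	}
	return 0;
}

This is the shape of the loops in shmem_get_orders_within_size(), shmem_suitable_orders() and shmem_alloc_and_add_folio() in the diff that follows.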
diff --git a/mm/shmem.c b/mm/shmem.c index 0aad0d9a621b..3a5a65b1f41a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,6 +40,7 @@ #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> +#include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; @@ -85,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init; #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_SIZE/512) #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ @@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init; #define SHORT_SYMLINK_LEN 128 /* - * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ @@ -107,7 +107,7 @@ struct shmem_falloc { pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ - pgoff_t nr_unswapped; /* how often writepage refused to swap out */ + pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { @@ -123,6 +123,10 @@ struct shmem_options { bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *encoding; + bool strict_encoding; +#endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 @@ -131,6 +135,14 @@ struct shmem_options { #define SHMEM_SEEN_QUOTA 32 }; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long huge_shmem_orders_always __read_mostly; +static unsigned long huge_shmem_orders_madvise __read_mostly; +static unsigned long huge_shmem_orders_inherit __read_mostly; +static unsigned long huge_shmem_orders_within_size __read_mostly; +static bool shmem_orders_configured __initdata; +#endif + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -148,7 +160,7 @@ static unsigned long shmem_default_max_inodes(void) static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct mm_struct *fault_mm, vm_fault_t *fault_type); + struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -434,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), - * shmem_writepage() has to raise swapped before nrpages is lowered - + * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ @@ -495,8 +507,8 @@ static int shmem_replace_entry(struct address_space *mapping, * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back from swap by a racing thread. * - * Checking page is not enough: by the time a SwapCache page is locked, it - * might be reused, and again be SwapCache, using the same swap as before. 
+ * Checking folio is not enough: by the time a swapcache folio is locked, it + * might be reused, and again be swapcache, using the same swap as before. */ static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) @@ -513,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping, * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, - * also respect fadvise()/madvise() hints; + * also respect madvise() hints; * SHMEM_HUGE_ADVISE: - * only allocate huge pages if requested with fadvise()/madvise(); + * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 @@ -540,57 +552,155 @@ static bool shmem_confirm_swap(struct address_space *mapping, /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; +static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; + +/** + * shmem_mapping_size_orders - Get allowable folio orders for the given file size. + * @mapping: Target address_space. + * @index: The page index. + * @write_end: end of a write, could extend inode size. + * + * This returns huge orders for folios (when supported) based on the file size + * which the mapping currently allows at the given index. The index is relevant + * due to alignment considerations the mapping might have. The returned order + * may be less than the size passed. + * + * Return: The orders. + */ +static inline unsigned int +shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) +{ + unsigned int order; + size_t size; + + if (!mapping_large_folio_support(mapping) || !write_end) + return 0; + + /* Calculate the write size based on the write_end */ + size = write_end - (index << PAGE_SHIFT); + order = filemap_get_order(size); + if (!order) + return 0; + + /* If we're not aligned, allocate a smaller folio */ + if (index & ((1UL << order) - 1)) + order = __ffs(index); + + order = min_t(size_t, order, MAX_PAGECACHE_ORDER); + return order > 0 ? BIT(order + 1) - 1 : 0; +} -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) { + pgoff_t aligned_index; + unsigned long order; loff_t i_size; + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) +{ + unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
+ 0 : BIT(HPAGE_PMD_ORDER); + unsigned long within_size_orders; + if (!S_ISREG(inode->i_mode)) - return false; - if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) - return false; + return 0; if (shmem_huge == SHMEM_HUGE_DENY) - return false; + return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) - return true; + return maybe_pmd_order; + /* + * The huge order allocation for anon shmem is controlled through + * the mTHP interface, so we still use PMD-sized huge order to + * check whether global control is enabled. + * + * For tmpfs mmap()'s huge order, we still use PMD-sized order to + * allocate huge pages due to lack of a write size hint. + * + * Otherwise, tmpfs allows getting a highest order hint based on + * the size of the write and fallocate paths, then tries each + * allowable huge order. + */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: - return true; + if (vma) + return maybe_pmd_order; + + return shmem_mapping_size_orders(inode->i_mapping, index, write_end); case SHMEM_HUGE_WITHIN_SIZE: - index = round_up(index + 1, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= index) - return true; + if (vma) + within_size_orders = maybe_pmd_order; + else + within_size_orders = shmem_mapping_size_orders(inode->i_mapping, + index, write_end); + + within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, + index, write_end); + if (within_size_orders > 0) + return within_size_orders; + fallthrough; case SHMEM_HUGE_ADVISE: - if (mm && (vm_flags & VM_HUGEPAGE)) - return true; + if (vm_flags & VM_HUGEPAGE) + return maybe_pmd_order; fallthrough; default: - return false; + return 0; } } -#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { + int huge; + + if (!str) + return -EINVAL; + if (!strcmp(str, "never")) - return SHMEM_HUGE_NEVER; - if (!strcmp(str, "always")) - return SHMEM_HUGE_ALWAYS; - if (!strcmp(str, "within_size")) - return SHMEM_HUGE_WITHIN_SIZE; - if (!strcmp(str, "advise")) - return SHMEM_HUGE_ADVISE; - if (!strcmp(str, "deny")) - return SHMEM_HUGE_DENY; - if (!strcmp(str, "force")) - return SHMEM_HUGE_FORCE; - return -EINVAL; + huge = SHMEM_HUGE_NEVER; + else if (!strcmp(str, "always")) + huge = SHMEM_HUGE_ALWAYS; + else if (!strcmp(str, "within_size")) + huge = SHMEM_HUGE_WITHIN_SIZE; + else if (!strcmp(str, "advise")) + huge = SHMEM_HUGE_ADVISE; + else if (!strcmp(str, "deny")) + huge = SHMEM_HUGE_DENY; + else if (!strcmp(str, "force")) + huge = SHMEM_HUGE_FORCE; + else + return -EINVAL; + + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + + return huge; } -#endif #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) @@ -616,15 +726,14 @@ static const char *shmem_format_huge(int huge) #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) + struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; - LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int split = 0; + unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -642,13 +751,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto next; } - /* Check if there's anything to gain */ - if (round_up(inode->i_size, PAGE_SIZE) == - round_up(inode->i_size, HPAGE_PMD_SIZE)) { - list_move(&info->shrinklist, &to_remove); - goto next; - } - list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; @@ -657,34 +759,36 @@ next: } spin_unlock(&sbinfo->shrinklist_lock); - list_for_each_safe(pos, next, &to_remove) { - info = list_entry(pos, struct shmem_inode_info, shrinklist); - inode = &info->vfs_inode; - list_del_init(&info->shrinklist); - iput(inode); - } - list_for_each_safe(pos, next, &list) { + pgoff_t next, end; + loff_t i_size; int ret; - pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) + if (nr_to_free && freed >= nr_to_free) goto move_back; - index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; - folio = filemap_get_folio(inode->i_mapping, index); - if (IS_ERR(folio)) + i_size = i_size_read(inode); + folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); + if (!folio || xa_is_value(folio)) goto drop; - /* No huge page at the end of the file: nothing to split */ + /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } + /* Check if there is anything to gain from splitting */ + next = folio_next_index(folio); + end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); + if (end <= folio->index || end >= next) { + folio_put(folio); + goto drop; + } + /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. @@ -705,6 +809,7 @@ next: if (ret) goto move_back; + freed += next - end; split++; drop: list_del_init(&info->shrinklist); @@ -748,19 +853,29 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_free) { - return false; + return 0; } -static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) +static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static void shmem_update_stats(struct folio *folio, int nr_pages) +{ + if (folio_test_pmd_mappable(folio)) + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); +} + /* * Somewhat like filemap_add_folio, but error if expected item has gone. 
*/ @@ -774,7 +889,6 @@ static int shmem_add_to_page_cache(struct folio *folio, VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON(expected && folio_test_large(folio)); folio_ref_add(folio, nr); folio->mapping = mapping; @@ -796,10 +910,7 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) - __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); @@ -827,26 +938,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; - __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); - folio_put(folio); + folio_put_refs(folio, nr); BUG_ON(error); } /* - * Remove swap entry from page cache, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. Returns + * the number of pages being freed. 0 means entry not found in XArray (0 pages + * being freed). */ -static int shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) +static long shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) { + int order = xa_get_order(&mapping->i_pages, index); void *old; old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); if (old != radswap) - return -ENOENT; - free_swap_and_cache(radix_to_swp_entry(radswap)); - return 0; + return 0; + free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + + return 1 << order; } /* @@ -869,7 +983,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) - swapped++; + swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { @@ -959,7 +1073,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * (although in some cases this is just a waste of time). 
*/ folio = NULL; - shmem_get_folio(inode, index, &folio, SGP_READ); + shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } @@ -998,7 +1112,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, + nr_swaps_freed += shmem_free_swap(mapping, indices[i], folio); continue; } @@ -1065,14 +1179,17 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + long swaps_freed; + if (unfalloc) continue; - if (shmem_free_swap(mapping, indices[i], folio)) { + swaps_freed = shmem_free_swap(mapping, indices[i], folio); + if (!swaps_freed) { /* Swap was replaced by page: retry */ index = indices[i]; break; } - nr_swaps_freed++; + nr_swaps_freed += swaps_freed; continue; } @@ -1144,7 +1261,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); - if (shmem_is_huge(inode, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1276,9 +1393,9 @@ static void shmem_evict_inode(struct inode *inode) #endif } -static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, struct folio_batch *fbatch, - pgoff_t *indices, unsigned int type) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; @@ -1311,7 +1428,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, } rcu_read_unlock(); - return xas.xa_index; + return folio_batch_count(fbatch); } /* @@ -1329,8 +1446,6 @@ static int shmem_unuse_swap_entries(struct inode *inode, for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(folio)) - continue; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { @@ -1358,8 +1473,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) do { folio_batch_init(&fbatch); - shmem_find_swap_entries(mapping, start, &fbatch, indices, type); - if (folio_batch_count(&fbatch) == 0) { + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { ret = 0; break; } @@ -1388,6 +1503,7 @@ int shmem_unuse(unsigned int type) return 0; mutex_lock(&shmem_swaplist_mutex); +start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); @@ -1406,63 +1522,74 @@ int shmem_unuse(unsigned int type) cond_resched(); mutex_lock(&shmem_swaplist_mutex); - next = list_next_entry(info, swaplist); - if (!info->swapped) - list_del_init(&info->swaplist); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; + if (list_empty(&info->swaplist)) + goto start_over; + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); } mutex_unlock(&shmem_swaplist_mutex); return error; } -/* - * Move the page from the page cache to the swap cache. +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @wbc: How writeback is to be done + * + * Move the folio from the page cache to the swap cache. 
*/ -static int shmem_writepage(struct page *page, struct writeback_control *wbc) +int shmem_writeout(struct folio *folio, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - swp_entry_t swap; pgoff_t index; + int nr_pages; + bool split = false; - /* - * Our capabilities prevent regular writeback or sync from ever calling - * shmem_writepage; but a stacking filesystem might use ->writepage of - * its underlying filesystem, in which case tmpfs should write out to - * swap only in response to memory pressure, and not for the writeback - * threads or sync. - */ if (WARN_ON_ONCE(!wbc->for_reclaim)) goto redirty; - if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap)) + if ((info->flags & VM_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* - * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or - * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, - * and its shmem_writeback() needs them to be split when swapping. + * If CONFIG_THP_SWAP is not enabled, the large folio should be + * split when swapping. + * + * And shrinkage of pages beyond i_size does not split swap, so + * swapout of a large folio crossing i_size needs to split too + * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { + index = shmem_fallocend(inode, + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); + if ((index > folio->index && index < folio_next_index(folio)) || + !IS_ENABLED(CONFIG_THP_SWAP)) + split = true; + } + + if (split) { +try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page(page) < 0) + if (split_folio_to_list(folio, wbc->list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } index = folio->index; + nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC @@ -1484,7 +1611,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) - shmem_falloc->nr_unswapped++; + shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); @@ -1496,10 +1623,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) folio_mark_uptodate(folio); } - swap = folio_alloc_swap(folio); - if (!swap.val) - goto redirty; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the folio is @@ -1512,20 +1635,20 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(folio, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, - NULL) == 0) { - shmem_recalc_inode(inode, 0, 1); - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); + if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) { + shmem_recalc_inode(inode, 0, nr_pages); + swap_shmem_alloc(folio->swap, nr_pages); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); - return swap_writepage(&folio->page, wbc); + return swap_writeout(folio, wbc); } - + if (!info->swapped) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); - put_swap_folio(folio, swap); + if (nr_pages > 1) + goto try_split; redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) @@ -1533,6 +1656,7 @@ redirty: folio_unlock(folio); return 0; } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) @@ -1609,51 +1733,88 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) return result; } -static struct folio *shmem_alloc_hugefolio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool shmem_hpage_pmd_enabled(void) { - struct mempolicy *mpol; - pgoff_t ilx; - struct page *page; - - mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); - page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); - mpol_cond_put(mpol); + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && + shmem_huge != SHMEM_HUGE_NEVER) + return true; - return page_rmappable_folio(page); + return false; } -static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) +unsigned long shmem_allowable_huge_orders(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + loff_t write_end, bool shmem_huge_force) { - struct mempolicy *mpol; - pgoff_t ilx; - struct page *page; + unsigned long mask = READ_ONCE(huge_shmem_orders_always); + unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); + unsigned long vm_flags = vma ? 
vma->vm_flags : 0; + unsigned int global_orders; - mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); - page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); - mpol_cond_put(mpol); + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + return 0; - return (struct folio *)page; -} + global_orders = shmem_huge_global_enabled(inode, index, write_end, + shmem_huge_force, vma, vm_flags); + /* Tmpfs huge pages allocation */ + if (!vma || !vma_is_anon_shmem(vma)) + return global_orders; -static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, - struct inode *inode, pgoff_t index, - struct mm_struct *fault_mm, bool huge) -{ - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct folio *folio; - long pages; - int error; + /* + * Following the 'deny' semantics of the top level, force the huge + * option off from all mounts. + */ + if (shmem_huge == SHMEM_HUGE_DENY) + return 0; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - huge = false; + /* + * Only allow inherit orders if the top-level value is 'force', which + * means non-PMD sized THP can not override 'huge' mount option now. + */ + if (shmem_huge == SHMEM_HUGE_FORCE) + return READ_ONCE(huge_shmem_orders_inherit); + + /* Allow mTHP that will be fully within i_size. */ + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_shmem_orders_madvise); - if (huge) { - pages = HPAGE_PMD_NR; - index = round_down(index, HPAGE_PMD_NR); + if (global_orders > 0) + mask |= READ_ONCE(huge_shmem_orders_inherit); + return THP_ORDERS_ALL_FILE_DEFAULT & mask; +} + +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; + pgoff_t aligned_index; + unsigned long pages; + int order; + + if (vma) { + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + if (!orders) + return 0; + } + + /* Find the highest order that can add into the page cache */ + order = highest_order(orders); + while (orders) { + pages = 1UL << order; + aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated @@ -1662,20 +1823,76 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. 
*/ - if (xa_find(&mapping->i_pages, &index, - index + HPAGE_PMD_NR - 1, XA_PRESENT)) - return ERR_PTR(-E2BIG); + if (!xa_find(&mapping->i_pages, &aligned_index, + aligned_index + pages - 1, XA_PRESENT)) + break; + order = next_order(&orders, order); + } - folio = shmem_alloc_hugefolio(gfp, info, index); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); + return orders; +} +#else +static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, + struct address_space *mapping, pgoff_t index, + unsigned long orders) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static struct folio *shmem_alloc_folio(gfp_t gfp, int order, + struct shmem_inode_info *info, pgoff_t index) +{ + struct mempolicy *mpol; + pgoff_t ilx; + struct folio *folio; + + mpol = shmem_get_pgoff_policy(info, index, order, &ilx); + folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); + + return folio; +} + +static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, + gfp_t gfp, struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, unsigned long orders) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long suitable_orders = 0; + struct folio *folio = NULL; + long pages; + int error, order; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + orders = 0; + + if (orders > 0) { + suitable_orders = shmem_suitable_orders(inode, vmf, + mapping, index, orders); + + order = highest_order(suitable_orders); + while (suitable_orders) { + pages = 1UL << order; + index = round_down(index, pages); + folio = shmem_alloc_folio(gfp, order, info, index); + if (folio) + goto allocated; + + if (pages == HPAGE_PMD_NR) + count_vm_event(THP_FILE_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); + order = next_order(&suitable_orders, order); + } } else { pages = 1; - folio = shmem_alloc_folio(gfp, info, index); + folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); +allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); @@ -1685,9 +1902,13 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; - } else if (huge) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); + } else if (pages > 1) { + if (pages == HPAGE_PMD_NR) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } @@ -1704,7 +1925,7 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ - shmem_unused_huge_shrink(sbinfo, NULL, 2); + shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. @@ -1734,6 +1955,65 @@ unlock: return ERR_PTR(error); } +static struct folio *shmem_swap_alloc_folio(struct inode *inode, + struct vm_area_struct *vma, pgoff_t index, + swp_entry_t entry, int order, gfp_t gfp) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *new; + void *shadow; + int nr_pages; + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success with further cpuset and node constraints. 
+ */ + gfp &= ~GFP_CONSTRAINT_MASK; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } + + new = shmem_alloc_folio(gfp, order, info, index); + if (!new) + return ERR_PTR(-ENOMEM); + + nr_pages = folio_nr_pages(new); + if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, + gfp, entry)) { + folio_put(new); + return ERR_PTR(-ENOMEM); + } + + /* + * Prevent parallel swapin from proceeding with the swap cache flag. + * + * Of course there is another possible concurrent scenario as well, + * that is to say, the swap cache flag of a large folio has already + * been set by swapcache_prepare(), while another thread may have + * already split the large swap entry stored in the shmem mapping. + * In this case, shmem_add_to_page_cache() will help identify the + * concurrent swapin and return -EEXIST. + */ + if (swapcache_prepare(entry, nr_pages)) { + folio_put(new); + return ERR_PTR(-EEXIST); + } + + __folio_set_locked(new); + __folio_set_swapbacked(new); + new->swap = entry; + + memcg1_swapin(entry, nr_pages); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(new, shadow); + folio_add_lru(new); + swap_read_folio(new, NULL); + return new; +} + /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of @@ -1752,30 +2032,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + struct vm_area_struct *vma) { - struct folio *old, *new; - struct address_space *swap_mapping; - swp_entry_t entry; - pgoff_t swap_index; - int error; - - old = *foliop; - entry = old->swap; - swap_index = swp_offset(entry); - swap_mapping = swap_address_space(entry); + struct folio *new, *old = *foliop; + swp_entry_t entry = old->swap; + struct address_space *swap_mapping = swap_address_space(entry); + pgoff_t swap_index = swap_cache_index(entry); + XA_STATE(xas, &swap_mapping->i_pages, swap_index); + int nr_pages = folio_nr_pages(old); + int error = 0, i; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - VM_BUG_ON_FOLIO(folio_test_large(old), old); - new = shmem_alloc_folio(gfp, info, index); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (nr_pages > 1) { + gfp_t huge_gfp = vma_thp_gfp_mask(vma); + + gfp = limit_gfp_mask(huge_gfp, gfp); + } +#endif + + new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; - folio_get(new); + folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); @@ -1785,26 +2070,32 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* - * Our caller will very soon move newpage out of swapcache, but it's - * a nice clean interface for us to replace oldpage by newpage there. 
- */ + /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, old, new); + for (i = 0; i < nr_pages; i++) { + void *item = xas_load(&xas); + + if (item != old) { + error = -ENOENT; + break; + } + + xas_store(&xas, new); + xas_next(&xas); + } if (!error) { - mem_cgroup_migrate(old, new); - __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); - __lruvec_stat_mod_folio(new, NR_SHMEM, 1); - __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); - __lruvec_stat_mod_folio(old, NR_SHMEM, -1); + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); } xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* - * Is this possible? I think not, now that our callers check - * both PageSwapCache and page_private after getting page lock; - * but be defensive. Reverse old to newpage for clear and free. + * Is this possible? I think not, now that our callers + * check both the swapcache flag and folio->private + * after getting the folio lock; but be defensive. + * Reverse old to new for clear and free. */ old = new; } else { @@ -1816,16 +2107,23 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, old->private = NULL; folio_unlock(old); - folio_put_refs(old, 2); + /* + * The old folio is removed from the swap cache; drop its 'nr_pages' + * references, as well as the one temporary reference taken from the + * swap cache. + */ + folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, - struct folio *folio, swp_entry_t swap) + struct folio *folio, swp_entry_t swap, + bool skip_swapcache) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; + int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, @@ -1834,15 +2132,92 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, if (old != swp_to_radix_entry(swap)) return; + nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); - delete_from_swap_cache(folio); + if (!skip_swapcache) + delete_from_swap_cache(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). 
*/ - shmem_recalc_inode(inode, -1, -1); - swap_free(swap); + shmem_recalc_inode(inode, -nr_pages, -nr_pages); + swap_free_nr(swap, nr_pages); +} + +static int shmem_split_large_entry(struct inode *inode, pgoff_t index, + swp_entry_t swap, gfp_t gfp) +{ + struct address_space *mapping = inode->i_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); + int split_order = 0, entry_order; + int i; + + /* Convert user data gfp flags to xarray node gfp flags */ + gfp &= GFP_RECLAIM_MASK; + + for (;;) { + void *old = NULL; + int cur_order; + pgoff_t swap_index; + + xas_lock_irq(&xas); + old = xas_load(&xas); + if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + + entry_order = xas_get_order(&xas); + + if (!entry_order) + goto unlock; + + /* Try to split large swap entry in pagecache */ + cur_order = entry_order; + swap_index = round_down(index, 1 << entry_order); + + split_order = xas_try_split_min_order(cur_order); + + while (cur_order > 0) { + pgoff_t aligned_index = + round_down(index, 1 << cur_order); + pgoff_t swap_offset = aligned_index - swap_index; + + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, cur_order); + if (xas_error(&xas)) + goto unlock; + + /* + * Re-set the swap entry after splitting, and the swap + * offset of the original large entry must be continuous. + */ + for (i = 0; i < 1 << cur_order; + i += (1 << split_order)) { + swp_entry_t tmp; + + tmp = swp_entry(swp_type(swap), + swp_offset(swap) + swap_offset + + i); + __xa_store(&mapping->i_pages, aligned_index + i, + swp_to_radix_entry(tmp), 0); + } + cur_order = split_order; + split_order = xas_try_split_min_order(split_order); + } + +unlock: + xas_unlock_irq(&xas); + + if (!xas_nomem(&xas, gfp)) + break; + } + + if (xas_error(&xas)) + return xas_error(&xas); + + return entry_order; } /* @@ -1853,15 +2228,17 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct mm_struct *fault_mm, + gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; + struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); struct swap_info_struct *si; struct folio *folio = NULL; + bool skip_swapcache = false; swp_entry_t swap; - int error; + int error, nr_pages, order, split_order; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); swap = radix_to_swp_entry(*foliop); @@ -1880,26 +2257,110 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap, NULL, 0); + order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; + bool fallback_order0 = false; + /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + + /* + * If uffd is active for the vma, we need per-page fault + * fidelity to maintain the uffd semantics, then fallback + * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. + */ + if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) + fallback_order0 = true; + + /* Skip swapcache for synchronous device. 
*/ + if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { + folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); + if (!IS_ERR(folio)) { + skip_swapcache = true; + goto alloced; + } + + /* + * Fallback to swapin order-0 folio unless the swap entry + * already exists. + */ + error = PTR_ERR(folio); + folio = NULL; + if (error == -EEXIST) + goto failed; + } + + /* + * Now the swap device can only swap in order 0 folios, so we + * should split the large swap entry stored in the pagecache + * if necessary. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } + /* Here we actually start the io */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } + } else if (order != folio_order(folio)) { + /* + * Swap readahead may swap in order 0 folios into swapcache + * asynchronously, while the shmem mapping can still store + * large swap entries. In such cases, we should split the + * large swap entry to prevent possible data corruption. + */ + split_order = shmem_split_large_entry(inode, index, swap, gfp); + if (split_order < 0) { + folio_put(folio); + folio = NULL; + error = split_order; + goto failed; + } + + /* + * If the large swap entry has already been split, it is + * necessary to recalculate the new swap entry based on + * the old order alignment. + */ + if (split_order > 0) { + pgoff_t offset = index - round_down(index, 1 << split_order); + + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); + } } +alloced: /* We have to do this with folio locked to prevent races */ folio_lock(folio); - if (!folio_test_swapcache(folio) || + if ((!skip_swapcache && !folio_test_swapcache(folio)) || folio->swap.val != swap.val || - !shmem_confirm_swap(mapping, index, swap)) { + !shmem_confirm_swap(mapping, index, swap) || + xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { error = -EEXIST; goto unlock; } @@ -1908,32 +2369,39 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); + nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. 
*/ - arch_swap_restore(swap, folio); + arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_folio(&folio, gfp, info, index); + error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } - error = shmem_add_to_page_cache(folio, mapping, index, + error = shmem_add_to_page_cache(folio, mapping, + round_down(index, nr_pages), swp_to_radix_entry(swap), gfp); if (error) goto failed; - shmem_recalc_inode(inode, 0, -1); + shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(folio); + if (skip_swapcache) { + folio->swap.val = 0; + swapcache_clear(si, swap, nr_pages); + } else { + delete_from_swap_cache(folio); + } folio_mark_dirty(folio); - swap_free(swap); + swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; @@ -1942,8 +2410,11 @@ failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; if (error == -EIO) - shmem_set_folio_swapin_error(inode, index, folio, swap); + shmem_set_folio_swapin_error(inode, index, folio, swap, + skip_swapcache); unlock: + if (skip_swapcache) + swapcache_clear(si, swap, folio_nr_pages(folio)); if (folio) { folio_unlock(folio); folio_put(folio); @@ -1963,14 +2434,15 @@ unlock: * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_fault *vmf, vm_fault_t *fault_type) + loff_t write_end, struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; + unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; @@ -1995,7 +2467,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, fault_mm, fault_type); + sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; @@ -2042,23 +2514,26 @@ repeat: return 0; } - if (shmem_is_huge(inode, index, false, fault_mm, - vma ? vma->vm_flags : 0)) { + /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); + if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_add_folio(huge_gfp, - inode, index, fault_mm, true); + folio = shmem_alloc_and_add_folio(vmf, huge_gfp, + inode, index, fault_mm, orders); if (!IS_ERR(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_FILE_ALLOC); + count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } - folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) @@ -2069,9 +2544,9 @@ repeat: alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && + if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - folio_next_index(folio) - 1) { + folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* @@ -2141,6 +2616,7 @@ unlock: * shmem_get_folio - find, and lock a shmem folio. 
* @inode: inode to search * @index: the page index. + * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * @@ -2160,10 +2636,10 @@ unlock: * Context: May sleep. * Return: 0 if successful, else a negative error code. */ -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp) +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp) { - return shmem_get_folio_gfp(inode, index, foliop, sgp, + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); @@ -2258,7 +2734,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } WARN_ON_ONCE(vmf->page != NULL); - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); @@ -2273,19 +2749,18 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { - unsigned long (*get_area)(struct file *, - unsigned long, unsigned long, unsigned long, unsigned long); unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; + unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; - get_area = current->mm->get_unmapped_area; - addr = get_area(file, uaddr, len, pgoff, flags); + addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff, + flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; @@ -2298,8 +2773,6 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (shmem_huge == SHMEM_HUGE_DENY) return addr; - if (len < HPAGE_PMD_SIZE) - return addr; if (flags & MAP_FIXED) return addr; /* @@ -2311,8 +2784,11 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (uaddr == addr) return addr; + hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; + unsigned long __maybe_unused hpage_orders; + int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); @@ -2325,33 +2801,54 @@ unsigned long shmem_get_unmapped_area(struct file *file, if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; + + /* + * Find the highest mTHP order used for anonymous shmem to + * provide a suitable alignment address. 
+ */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hpage_orders = READ_ONCE(huge_shmem_orders_always); + hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); + hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); + if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) + hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); + + if (hpage_orders > 0) { + order = highest_order(hpage_orders); + hpage_size = PAGE_SIZE << order; + } +#endif } - if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } - offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); - if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + if (len < hpage_size) + return addr; + + offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); + if (offset && offset + len < 2 * hpage_size) return addr; - if ((addr & (HPAGE_PMD_SIZE-1)) == offset) + if ((addr & (hpage_size - 1)) == offset) return addr; - inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; + inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; - inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr, + inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; - inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) - inflated_addr += HPAGE_PMD_SIZE; + inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; @@ -2433,15 +2930,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - - /* arm64 - allow memory tagging on RAM-based files */ - vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -2461,13 +2949,62 @@ static int shmem_file_open(struct inode *inode, struct file *file) #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); +#if IS_ENABLED(CONFIG_UNICODE) +/* + * shmem_inode_casefold_flags - Deal with casefold file attribute flag + * + * The casefold file attribute needs some special checks. It can only be added to + * an empty dir, and can't be removed from a non-empty dir. + */ +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + unsigned int old = inode->i_flags; + struct super_block *sb = inode->i_sb; + + if (fsflags & FS_CASEFOLD_FL) { + if (!(old & S_CASEFOLD)) { + if (!sb->s_encoding) + return -EOPNOTSUPP; + + if (!S_ISDIR(inode->i_mode)) + return -ENOTDIR; + + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + *i_flags = *i_flags | S_CASEFOLD; + } else if (old & S_CASEFOLD) { + if (dentry && !simple_empty(dentry)) + return -ENOTEMPTY; + } + + return 0; +} +#else +static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, + struct dentry *dentry, unsigned int *i_flags) +{ + if (fsflags & FS_CASEFOLD_FL) + return -EOPNOTSUPP; + + return 0; +} +#endif + /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. 
*/ -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; + int ret; + + ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); + if (ret) + return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; @@ -2478,10 +3015,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) /* * But FS_NODUMP_FL does not require any action in i_flags. */ - inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); + + return 0; } #else -static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL @@ -2528,14 +3067,17 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); + shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - mapping_set_large_folios(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: @@ -2640,7 +3182,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; - folio = shmem_alloc_folio(gfp, info, pgoff); + folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) goto out_unacct_blocks; @@ -2730,7 +3272,7 @@ static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2747,27 +3289,25 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; - *pagep = folio_file_page(folio, index); - if (PageHWPoison(*pagep)) { + if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; return -EIO; } + *foliop = folio; return 0; } static int shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) @@ -2797,28 +3337,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long offset; int error = 0; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - - index = *ppos >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; for (;;) { struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); + bool fallback_page_copy = false; + size_t fsize; - end_index = i_size >> PAGE_SHIFT; - if (index > end_index) + 
if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) - break; - } - error = shmem_get_folio(inode, index, &folio, SGP_READ); + index = iocb->ki_pos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -2833,24 +3365,29 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ - nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) { - if (folio) - folio_put(folio); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + if (folio && likely(!fallback_page_copy)) + fsize = folio_size(folio); + else + fsize = PAGE_SIZE; + offset = iocb->ki_pos & (fsize - 1); + nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* @@ -2858,10 +3395,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_copy)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + /* - * Mark the page accessed if we read the beginning. + * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); @@ -2869,9 +3411,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ - ret = copy_page_to_iter(page, offset, nr, to); + if (likely(!fallback_page_copy)) + ret = copy_folio_to_iter(folio, offset, nr, to); + else + ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); - } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but @@ -2889,9 +3433,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } retval += ret; - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; + iocb->ki_pos += ret; if (!iov_iter_count(to)) break; @@ -2902,7 +3444,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? 
retval : error; } @@ -2959,7 +3500,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -2986,16 +3527,21 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { + bool fallback_page_splice = false; + struct page *page = NULL; + pgoff_t index; + size_t size; + if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, - SGP_READ); + index = *ppos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -3004,12 +3550,15 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (folio) { folio_unlock(folio); - if (folio_test_hwpoison(folio) || - (folio_test_large(folio) && - folio_test_has_hwpoisoned(folio))) { + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_splice = true; } /* @@ -3023,7 +3572,17 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; - part = min_t(loff_t, isize - *ppos, len); + /* + * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned + * pages. + */ + size = len; + if (unlikely(fallback_page_splice)) { + size_t offset = *ppos & ~PAGE_MASK; + + size = umin(size, PAGE_SIZE - offset); + } + part = min_t(loff_t, isize - *ppos, size); if (folio) { /* @@ -3031,8 +3590,12 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_splice)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can @@ -3051,7 +3614,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); @@ -3173,16 +3736,19 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct folio *folio; /* - * Good, the fallocate(2) manpage permits EINTR: we may have - * been interrupted because we are using up too much memory. + * Check for fatal signal so that we abort early in OOM + * situations. We don't want to abort in case of non-fatal + * signals as large fallocate can take noticeable time and + * e.g. periodic timers may result in fallocate constantly + * restarting. 
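+		 * A fatal signal (SIGKILL, e.g. from the OOM killer) still
+		 * terminates this loop promptly.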
*/ - if (signal_pending(current)) + if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_get_folio(inode, index, &folio, - SGP_FALLOC); + error = shmem_get_folio(inode, index, offset + len, + &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ @@ -3205,7 +3771,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, index--; /* - * Inform shmem_writepage() how far we have reached. + * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) @@ -3272,6 +3838,9 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; + if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) + return -EINVAL; + inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3291,7 +3860,12 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); + + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ return error; @@ -3328,16 +3902,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, @@ -3382,7 +3956,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ - d_instantiate(dentry, inode); + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); out: return ret; } @@ -3402,12 +3979,20 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - does all the work */ + + /* + * For now, VFS can't deal with case-insensitive negative dentries, so + * we invalidate them + */ + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_invalidate(dentry); + return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_offset_empty(dentry)) + if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3464,7 +4049,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_offset_empty(new_dentry)) + if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { @@ -3473,8 +4058,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return error; } - simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); - error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (error) return error; @@ -3504,6 +4088,7 @@ static int 
shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, int len; struct inode *inode; struct folio *folio; + char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3525,16 +4110,17 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - inode->i_link = kmemdup(symname, len, GFP_KERNEL); - if (!inode->i_link) { + link = kmemdup(symname, len, GFP_KERNEL); + if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; + inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; @@ -3547,7 +4133,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); - d_instantiate(dentry, inode); + if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3580,7 +4169,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, return ERR_PTR(-ECHILD); } } else { - error = shmem_get_folio(inode, 0, &folio, SGP_READ); + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) @@ -3612,16 +4201,23 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); + int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; - info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - shmem_set_inode_flags(inode, info->fsflags); + ret = shmem_set_inode_flags(inode, flags, dentry); + + if (ret) + return ret; + + info->fsflags = flags; + inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; @@ -3900,6 +4496,9 @@ enum shmem_param { Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, + Opt_casefold_version, + Opt_casefold, + Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3911,14 +4510,14 @@ static const struct constant_table shmem_param_enums_huge[] = { }; const struct fs_parameter_spec shmem_fs_parameters[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), @@ -3931,9 +4530,54 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif + fsparam_string("casefold", Opt_casefold_version), + fsparam_flag ("casefold", Opt_casefold), + fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; +#if IS_ENABLED(CONFIG_UNICODE) +static int 
shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + struct shmem_options *ctx = fc->fs_private; + int version = UTF8_LATEST; + struct unicode_map *encoding; + char *version_str = param->string + 5; + + if (!latest_version) { + if (strncmp(param->string, "utf8-", 5)) + return invalfc(fc, "Only UTF-8 encodings are supported " + "in the format: utf8-<version number>"); + + version = utf8_parse_version(version_str); + if (version < 0) + return invalfc(fc, "Invalid UTF-8 version: %s", version_str); + } + + encoding = utf8_load(version); + + if (IS_ERR(encoding)) { + return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), + unicode_rev(version)); + } + + pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", + unicode_major(version), unicode_minor(version), unicode_rev(version)); + + ctx->encoding = encoding; + + return 0; +} +#else +static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, + bool latest_version) +{ + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +} +#endif + static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; @@ -3978,9 +4622,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - kuid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(kuid)) - goto bad_value; + kuid = result.uid; /* * The requested uid must be representable in the @@ -3992,9 +4634,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->uid = kuid; break; case Opt_gid: - kgid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(kgid)) - goto bad_value; + kgid = result.gid; /* * The requested gid must be representable in the @@ -4096,6 +4736,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; + case Opt_casefold_version: + return shmem_parse_opt_casefold(fc, param, false); + case Opt_casefold: + return shmem_parse_opt_casefold(fc, param, true); + case Opt_strict_encoding: +#if IS_ENABLED(CONFIG_UNICODE) + ctx->strict_encoding = true; + break; +#else + return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); +#endif } return 0; @@ -4105,48 +4756,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. 
- */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -4325,6 +4965,11 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) + utf8_unload(sb->s_encoding); +#endif + #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif @@ -4335,6 +4980,14 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } +#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) +static const struct dentry_operations shmem_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_delete = always_delete_dentry, +}; +#endif + static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; @@ -4369,9 +5022,25 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC | SB_I_VERSION; + +#if IS_ENABLED(CONFIG_UNICODE) + if (!ctx->encoding && ctx->strict_encoding) { + pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); + error = -EINVAL; + goto failed; + } + + if (ctx->encoding) { + sb->s_encoding = ctx->encoding; + sb->s_d_op = &shmem_ci_dentry_ops; + if (ctx->strict_encoding) + sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; + } +#endif + #else sb->s_flags |= SB_NOUSER; -#endif +#endif /* CONFIG_TMPFS */ sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; @@ -4384,7 +5053,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; - sbinfo->huge = ctx->huge; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + else + sbinfo->huge = tmpfs_huge; +#endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; @@ -4462,7 +5136,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif @@ -4520,7 +5194,6 @@ static int shmem_error_remove_folio(struct address_space *mapping, } static const struct address_space_operations shmem_aops = { - .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, @@ -4645,6 +5318,10 @@ int shmem_init_fs_context(struct fs_context *fc) ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); +#if IS_ENABLED(CONFIG_UNICODE) + ctx->encoding = NULL; +#endif + fc->fs_private = 
ctx; fc->ops = &shmem_fs_context_ops; return 0; @@ -4658,9 +5335,69 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, +}; + +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + +#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ +{ \ + .attr = { .name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ +} + +#define TMPFS_ATTR_W(_name, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + +#define TMPFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define TMPFS_ATTR_RO(_name, _show) \ + static struct kobj_attribute tmpfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + +#if IS_ENABLED(CONFIG_UNICODE) +static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, + char *buf) +{ + return sysfs_emit(buf, "supported\n"); +} +TMPFS_ATTR_RO(casefold, casefold_show); +#endif + +static struct attribute *tmpfs_attributes[] = { +#if IS_ENABLED(CONFIG_UNICODE) + &tmpfs_attr_casefold.attr, +#endif + NULL }; +static const struct attribute_group tmpfs_attribute_group = { + .attrs = tmpfs_attributes, + .name = "features" +}; + +static struct kobject *tmpfs_kobj; + +static int __init tmpfs_sysfs_init(void) +{ + int ret; + + tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); + if (!tmpfs_kobj) + return -ENOMEM; + + ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); + if (ret) + kobject_put(tmpfs_kobj); + + return ret; +} +#endif /* CONFIG_SYSFS && CONFIG_TMPFS */ + void __init shmem_init(void) { int error; @@ -4668,11 +5405,7 @@ void __init shmem_init(void) shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA - error = register_quota_format(&shmem_quota_format); - if (error < 0) { - pr_err("Could not register quota format\n"); - goto out3; - } + register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); @@ -4688,11 +5421,26 @@ void __init shmem_init(void) goto out1; } +#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) + error = tmpfs_sysfs_init(); + if (error) { + pr_err("Could not init tmpfs sysfs\n"); + goto out1; + } +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ + + /* + * Default to setting PMD-sized THP to inherit the global setting and + * disable all other multi-size THPs. 
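+	 * This keeps the historical behaviour: only the PMD order follows
+	 * /sys/kernel/mm/transparent_hugepage/shmem_enabled unless the
+	 * thp_shmem= boot parameter configured the orders explicitly.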
+ */ + if (!shmem_orders_configured) + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -4701,7 +5449,6 @@ out1: out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); -out3: #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); @@ -4736,7 +5483,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; - int huge; + int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; @@ -4747,20 +5494,238 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge = shmem_parse_huge(tmp); if (huge == -EINVAL) - return -EINVAL; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) - return -EINVAL; + return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; - return count; + + err = start_stop_khugepaged(); + return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +static DEFINE_SPINLOCK(huge_shmem_orders_lock); + +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int order = to_thpsize(kobj)->order; + const char *output; + + if (test_bit(order, &huge_shmem_orders_always)) + output = "[always] inherit within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_inherit)) + output = "always [inherit] within_size advise never"; + else if (test_bit(order, &huge_shmem_orders_within_size)) + output = "always inherit [within_size] advise never"; + else if (test_bit(order, &huge_shmem_orders_madvise)) + output = "always inherit within_size [advise] never"; + else + output = "always inherit within_size advise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int order = to_thpsize(kobj)->order; + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_always); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "inherit")) { + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (shmem_huge == SHMEM_HUGE_FORCE && + order != HPAGE_PMD_ORDER) + return -EINVAL; + + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_madvise); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_inherit); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "within_size")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_madvise); + set_bit(order, &huge_shmem_orders_within_size); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "advise")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + set_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else if (sysfs_streq(buf, "never")) { + spin_lock(&huge_shmem_orders_lock); + clear_bit(order, &huge_shmem_orders_always); + 
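+		/* clear the rest as well: no policy bit set means 'never' */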
clear_bit(order, &huge_shmem_orders_inherit); + clear_bit(order, &huge_shmem_orders_within_size); + clear_bit(order, &huge_shmem_orders_madvise); + spin_unlock(&huge_shmem_orders_lock); + } else { + ret = -EINVAL; + } + + if (ret > 0) { + int err = start_stop_khugepaged(); + + if (err) + ret = err; + } + return ret; +} + +struct kobj_attribute thpsize_shmem_enabled_attr = + __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) + +static int __init setup_transparent_hugepage_shmem(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge == -EINVAL) { + pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); + return huge; + } + + shmem_huge = huge; + return 1; +} +__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); + +static int __init setup_transparent_hugepage_tmpfs(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge < 0) { + pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); + return huge; + } + + tmpfs_huge = huge; + return 1; +} +__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); + +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_shmem(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise, within_size; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strscpy(str_dup, str); + + always = huge_shmem_orders_always; + inherit = huge_shmem_orders_inherit; + madvise = huge_shmem_orders_madvise; + within_size = huge_shmem_orders_within_size; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end < 0) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "advise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "within_size")) { + bitmap_set(&within_size, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else { + pr_err("invalid 
policy %s in thp_shmem boot parameter\n", policy); + goto err; + } + } + } + + huge_shmem_orders_always = always; + huge_shmem_orders_madvise = madvise; + huge_shmem_orders_inherit = inherit; + huge_shmem_orders_within_size = within_size; + shmem_orders_configured = true; + return 1; + +err: + pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_shmem=", setup_thp_shmem); + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #else /* !CONFIG_SHMEM */ /* @@ -4807,7 +5772,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #endif @@ -4847,12 +5812,12 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); - if (shmem_acct_size(flags, size)) - return ERR_PTR(-ENOMEM); - if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { @@ -4877,7 +5842,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4889,7 +5854,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs - * @name: name for dentry (to be seen in /proc/<pid>/maps + * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4902,7 +5867,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ @@ -4963,7 +5928,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error);
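The kerneldoc hunks above describe the shmem_file_setup() family used by in-kernel callers to obtain an unlinked tmpfs file. A minimal sketch of such a caller follows (editor's illustration, not part of this patch: the helper name and size are invented, and only the exported shmem_file_setup() API documented above is assumed):

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/shmem_fs.h>
#include <linux/sizes.h>

/* Illustrative helper: create a 1 MiB unlinked tmpfs file. */
static struct file *example_shmem_buffer(void)
{
	/* "example-buf" is the name that appears in /proc/<pid>/maps */
	struct file *file = shmem_file_setup("example-buf", SZ_1M, VM_NORESERVE);

	if (IS_ERR(file))
		return file;	/* e.g. ERR_PTR(-ENOMEM) if accounting fails */

	/* No path points at the file; the last fput() releases it. */
	return file;
}

Likewise, the casefold options added to shmem_fs_parameters above would be exercised from userspace with something like "mount -t tmpfs -o casefold=utf8-12.1.0,strict_encoding none /mnt" (a hypothetical invocation, derived from the option table and the utf8-<version number> format the parser expects).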