Diffstat (limited to 'mm/shmem.c')
 mm/shmem.c | 1082 ++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 651 insertions(+), 431 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index ccb9629a0f70..3f194c9842a8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
@@ -99,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -108,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -132,8 +131,7 @@ struct shmem_options {
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
-#define SHMEM_SEEN_NOSWAP 16
-#define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_QUOTA 16
};
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -176,20 +174,20 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
*/
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
- return (flags & VM_NORESERVE) ?
+ return (flags & SHMEM_F_NORESERVE) ?
0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}
static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
- if (!(flags & VM_NORESERVE))
+ if (!(flags & SHMEM_F_NORESERVE))
vm_unacct_memory(VM_ACCT(size));
}
static inline int shmem_reacct_size(unsigned long flags,
loff_t oldsize, loff_t newsize)
{
- if (!(flags & VM_NORESERVE)) {
+ if (!(flags & SHMEM_F_NORESERVE)) {
if (VM_ACCT(newsize) > VM_ACCT(oldsize))
return security_vm_enough_memory_mm(current->mm,
VM_ACCT(newsize) - VM_ACCT(oldsize));
@@ -207,7 +205,7 @@ static inline int shmem_reacct_size(unsigned long flags,
*/
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
- if (!(flags & VM_NORESERVE))
+ if (!(flags & SHMEM_F_NORESERVE))
return 0;
return security_vm_enough_memory_mm(current->mm,
@@ -216,11 +214,11 @@ static inline int shmem_acct_blocks(unsigned long flags, long pages)
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
- if (flags & VM_NORESERVE)
+ if (flags & SHMEM_F_NORESERVE)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
-static int shmem_inode_acct_blocks(struct inode *inode, long pages)
+int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -276,24 +274,24 @@ static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
-bool shmem_mapping(struct address_space *mapping)
+bool shmem_mapping(const struct address_space *mapping)
{
return mapping->a_ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);
-bool vma_is_anon_shmem(struct vm_area_struct *vma)
+bool vma_is_anon_shmem(const struct vm_area_struct *vma)
{
return vma->vm_ops == &shmem_anon_vm_ops;
}
-bool vma_is_shmem(struct vm_area_struct *vma)
+bool vma_is_shmem(const struct vm_area_struct *vma)
{
return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}
static LIST_HEAD(shmem_swaplist);
-static DEFINE_MUTEX(shmem_swaplist_mutex);
+static DEFINE_SPINLOCK(shmem_swaplist_lock);
#ifdef CONFIG_TMPFS_QUOTA
@@ -433,10 +431,13 @@ static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
*
* But normally info->alloced == inode->i_mapping->nrpages + info->swapped
* So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
+ *
+ * Return: true if swapped was incremented from 0, for shmem_writeout().
*/
-static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
+bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ bool first_swapped = false;
long freed;
spin_lock(&info->lock);
@@ -447,12 +448,15 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
- if (swapped > 0)
+ if (swapped > 0) {
+ if (info->swapped == swapped)
+ first_swapped = true;
freed += swapped;
+ }
if (freed > 0)
info->alloced -= freed;
spin_unlock(&info->lock);
@@ -460,6 +464,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/* The quota case may block */
if (freed > 0)
shmem_inode_unacct_blocks(inode, freed);
+ return first_swapped;
}
bool shmem_charge(struct inode *inode, long pages)
@@ -506,15 +511,27 @@ static int shmem_replace_entry(struct address_space *mapping,
/*
* Sometimes, before we decide whether to proceed or to fail, we must check
- * that an entry was not already brought back from swap by a racing thread.
+ * that an entry was not already brought back or split by a racing thread.
*
* Checking folio is not enough: by the time a swapcache folio is locked, it
* might be reused, and again be swapcache, using the same swap as before.
+ * Returns the swap entry's order if it is still present, else returns -1.
*/
-static bool shmem_confirm_swap(struct address_space *mapping,
- pgoff_t index, swp_entry_t swap)
+static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index,
+ swp_entry_t swap)
{
- return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int ret = -1;
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_load(&xas);
+ if (entry == swp_to_radix_entry(swap))
+ ret = xas_get_order(&xas);
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -526,9 +543,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
* enables huge pages for the mount;
* SHMEM_HUGE_WITHIN_SIZE:
* only allocate huge pages if the page will be fully within i_size,
- * also respect fadvise()/madvise() hints;
+ * also respect madvise() hints;
* SHMEM_HUGE_ADVISE:
- * only allocate huge pages if requested with fadvise()/madvise();
+ * only allocate huge pages if requested with madvise();
*/
#define SHMEM_HUGE_NEVER 0
@@ -552,39 +569,101 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
-static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE)
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE
+#else
+#define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#endif
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT;
+
+#undef SHMEM_HUGE_DEFAULT
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE)
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE
+#else
+#define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER
+#endif
+
+static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT;
+
+#undef TMPFS_HUGE_DEFAULT
+
+static unsigned int shmem_get_orders_within_size(struct inode *inode,
+ unsigned long within_size_orders, pgoff_t index,
+ loff_t write_end)
{
+ pgoff_t aligned_index;
+ unsigned long order;
loff_t i_size;
- if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
- return false;
+ order = highest_order(within_size_orders);
+ while (within_size_orders) {
+ aligned_index = round_up(index + 1, 1 << order);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= aligned_index)
+ return within_size_orders;
+
+ order = next_order(&within_size_orders, order);
+ }
+
+ return 0;
+}
+
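The new helper is easiest to sanity-check with concrete numbers. Below is a minimal userspace sketch of the same comparison (illustrative only: round_up_ull and order_within_size are stand-in names; 4K pages and order 9, the PMD order on x86_64, are assumed):

#include <stdio.h>

static unsigned long long round_up_ull(unsigned long long x,
				       unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);
}

/* Mirrors the aligned_index vs. i_size comparison in the helper above. */
static int order_within_size(unsigned long index, unsigned int order,
			     unsigned long long i_size)
{
	unsigned long long aligned_index = round_up_ull(index + 1, 1ULL << order);

	return (round_up_ull(i_size, 4096) >> 12) >= aligned_index;
}

int main(void)
{
	/* 4MB file covers pages 0..1023, >= aligned_index 512: allowed. */
	printf("4MB, index 0, order 9: %d\n", order_within_size(0, 9, 4ULL << 20));
	/* 1MB file covers pages 0..255, < 512: not allowed. */
	printf("1MB, index 0, order 9: %d\n", order_within_size(0, 9, 1ULL << 20));
	return 0;
}
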
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
+ 0 : BIT(HPAGE_PMD_ORDER);
+ unsigned long within_size_orders;
+
if (!S_ISREG(inode->i_mode))
- return false;
+ return 0;
if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
+ return 0;
if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
- return true;
+ return maybe_pmd_order;
+ /*
+ * The huge order allocation for anon shmem is controlled through
+ * the mTHP interface, so we still use PMD-sized huge order to
+ * check whether global control is enabled.
+ *
+ * For tmpfs with 'huge=always' or 'huge=within_size' mount option,
+ * we will always try the PMD-sized order first. If that fails, it
+ * will fall back to smaller large folios.
+ */
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
- return true;
+ return THP_ORDERS_ALL_FILE_DEFAULT;
case SHMEM_HUGE_WITHIN_SIZE:
- index = round_up(index + 1, HPAGE_PMD_NR);
- i_size = max(write_end, i_size_read(inode));
- i_size = round_up(i_size, PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index)
- return true;
+ within_size_orders = shmem_get_orders_within_size(inode,
+ THP_ORDERS_ALL_FILE_DEFAULT, index, write_end);
+ if (within_size_orders > 0)
+ return within_size_orders;
+
fallthrough;
case SHMEM_HUGE_ADVISE:
if (vm_flags & VM_HUGEPAGE)
- return true;
+ return THP_ORDERS_ALL_FILE_DEFAULT;
fallthrough;
default:
- return false;
+ return 0;
}
}
@@ -779,23 +858,34 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
return 0;
}
-static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
- loff_t write_end, bool shmem_huge_force,
- unsigned long vm_flags)
+static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
{
- return false;
+ return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void shmem_update_stats(struct folio *folio, int nr_pages)
+{
+ if (folio_test_pmd_mappable(folio))
+ lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
+}
+
/*
* Somewhat like filemap_add_folio, but error if expected item has gone.
*/
-static int shmem_add_to_page_cache(struct folio *folio,
- struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+int shmem_add_to_page_cache(struct folio *folio,
+ struct address_space *mapping,
+ pgoff_t index, void *expected, gfp_t gfp)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
- long nr = folio_nr_pages(folio);
+ unsigned long nr = folio_nr_pages(folio);
+ swp_entry_t iter, swap;
+ void *entry;
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -807,24 +897,32 @@ static int shmem_add_to_page_cache(struct folio *folio,
gfp &= GFP_RECLAIM_MASK;
folio_throttle_swaprate(folio, gfp);
+ swap = radix_to_swp_entry(expected);
do {
+ iter = swap;
xas_lock_irq(&xas);
- if (expected != xas_find_conflict(&xas)) {
- xas_set_err(&xas, -EEXIST);
- goto unlock;
+ xas_for_each_conflict(&xas, entry) {
+ /*
+ * The range must either be empty, or filled with
+ * expected swap entries. Shmem swap entries are never
+ * partially freed without split of both entry and
+ * folio, so there shouldn't be any holes.
+ */
+ if (!expected || entry != swp_to_radix_entry(iter)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ iter.val += 1 << xas_get_order(&xas);
}
- if (expected && xas_find_conflict(&xas)) {
+ if (expected && iter.val - nr != swap.val) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (folio_test_pmd_mappable(folio))
- __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
+ shmem_update_stats(folio, nr);
mapping->nrpages += nr;
unlock:
xas_unlock_irq(&xas);
@@ -852,8 +950,7 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
error = shmem_replace_entry(mapping, folio->index, folio, radswap);
folio->mapping = NULL;
mapping->nrpages -= nr;
- __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ shmem_update_stats(folio, -nr);
xa_unlock_irq(&mapping->i_pages);
folio_put_refs(folio, nr);
BUG_ON(error);
@@ -889,15 +986,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
unsigned long swapped = 0;
unsigned long max = end - 1;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
@@ -996,7 +1093,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
-static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
bool unfalloc)
{
struct address_space *mapping = inode->i_mapping;
@@ -1053,7 +1150,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if (folio) {
- same_folio = lend < folio_pos(folio) + folio_size(folio);
+ same_folio = lend < folio_next_pos(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio_next_index(folio);
@@ -1147,7 +1244,7 @@ whole_folios:
shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -1176,7 +1273,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1217,6 +1314,8 @@ static int shmem_setattr(struct mnt_idmap *idmap,
return -EPERM;
if (newsize != oldsize) {
+ if (info->flags & SHMEM_F_MAPPING_FROZEN)
+ return -EPERM;
error = shmem_reacct_size(SHMEM_I(inode)->flags,
oldsize, newsize);
if (error)
@@ -1290,11 +1389,11 @@ static void shmem_evict_inode(struct inode *inode)
/* Wait while shmem_unuse() is scanning this inode... */
wait_var_event(&info->stop_eviction,
!atomic_read(&info->stop_eviction));
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
/* ...but beware of the race if we peeked too early */
if (!atomic_read(&info->stop_eviction))
list_del_init(&info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
}
}
@@ -1308,9 +1407,9 @@ static void shmem_evict_inode(struct inode *inode)
#endif
}
-static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, struct folio_batch *fbatch,
- pgoff_t *indices, unsigned int type)
+static unsigned int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct folio *folio;
@@ -1343,7 +1442,7 @@ static int shmem_find_swap_entries(struct address_space *mapping,
}
rcu_read_unlock();
- return xas.xa_index;
+ return folio_batch_count(fbatch);
}
/*
@@ -1361,8 +1460,6 @@ static int shmem_unuse_swap_entries(struct inode *inode,
for (i = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(folio))
- continue;
error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
mapping_gfp_mask(mapping), NULL, NULL);
if (error == 0) {
@@ -1390,8 +1487,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
do {
folio_batch_init(&fbatch);
- shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
- if (folio_batch_count(&fbatch) == 0) {
+ if (!shmem_find_swap_entries(mapping, start, &fbatch,
+ indices, type)) {
ret = 0;
break;
}
@@ -1419,7 +1516,8 @@ int shmem_unuse(unsigned int type)
if (list_empty(&shmem_swaplist))
return 0;
- mutex_lock(&shmem_swaplist_mutex);
+ spin_lock(&shmem_swaplist_lock);
+start_over:
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
@@ -1432,51 +1530,47 @@ int shmem_unuse(unsigned int type)
* (igrab() would protect from unlink, but not from unmount).
*/
atomic_inc(&info->stop_eviction);
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
- mutex_lock(&shmem_swaplist_mutex);
- next = list_next_entry(info, swaplist);
- if (!info->swapped)
- list_del_init(&info->swaplist);
+ spin_lock(&shmem_swaplist_lock);
if (atomic_dec_and_test(&info->stop_eviction))
wake_up_var(&info->stop_eviction);
if (error)
break;
+ if (list_empty(&info->swaplist))
+ goto start_over;
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
}
- mutex_unlock(&shmem_swaplist_mutex);
+ spin_unlock(&shmem_swaplist_lock);
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @plug: swap plug
+ * @folio_list: list to put back folios on split
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+ struct list_head *folio_list)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- swp_entry_t swap;
pgoff_t index;
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
- if (WARN_ON_ONCE(!wbc->for_reclaim))
- goto redirty;
-
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -1502,9 +1596,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, folio_list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1531,7 +1624,7 @@ try_split:
!shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
- shmem_falloc->nr_unswapped++;
+ shmem_falloc->nr_unswapped += nr_pages;
else
shmem_falloc = NULL;
spin_unlock(&inode->i_lock);
@@ -1543,47 +1636,66 @@ try_split:
folio_mark_uptodate(folio);
}
- swap = folio_alloc_swap(folio);
- if (!swap.val) {
- if (nr_pages > 1)
- goto try_split;
+ if (!folio_alloc_swap(folio)) {
+ bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages);
+ int error;
- goto redirty;
- }
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now before the folio is
+ * removed from page cache, when its pagelock no longer
+ * protects the inode from eviction. And do it now, after
+ * we've incremented swapped, because shmem_unuse() will
+ * prune a !swapped inode from the swaplist.
+ */
+ if (first_swapped) {
+ spin_lock(&shmem_swaplist_lock);
+ if (list_empty(&info->swaplist))
+ list_add(&info->swaplist, &shmem_swaplist);
+ spin_unlock(&shmem_swaplist_lock);
+ }
+
+ swap_shmem_alloc(folio->swap, nr_pages);
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
- /*
- * Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the folio is
- * moved to swap cache, when its pagelock no longer protects
- * the inode from eviction. But don't unlock the mutex until
- * we've incremented swapped, because shmem_unuse_inode() will
- * prune a !swapped inode from the swaplist under this mutex.
- */
- mutex_lock(&shmem_swaplist_mutex);
- if (list_empty(&info->swaplist))
- list_add(&info->swaplist, &shmem_swaplist);
-
- if (add_to_swap_cache(folio, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
- NULL) == 0) {
- shmem_recalc_inode(inode, 0, nr_pages);
- swap_shmem_alloc(swap, nr_pages);
- shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
-
- mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
- }
+ error = swap_writeout(folio, plug);
+ if (error != AOP_WRITEPAGE_ACTIVATE) {
+ /* folio has been unlocked */
+ return error;
+ }
- mutex_unlock(&shmem_swaplist_mutex);
- put_swap_folio(folio, swap);
+ /*
+ * The intention here is to avoid holding on to the swap when
+ * zswap was unable to compress and unable to writeback; but
+ * it will be appropriate if other reactivate cases are added.
+ */
+ error = shmem_add_to_page_cache(folio, mapping, index,
+ swp_to_radix_entry(folio->swap),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ /* Swap entry might be erased by racing shmem_free_swap() */
+ if (!error) {
+ shmem_recalc_inode(inode, 0, -nr_pages);
+ swap_free_nr(folio->swap, nr_pages);
+ }
+
+ /*
+ * The swap_cache_del_folio() below could be left for
+ * shrink_folio_list()'s folio_free_swap() to dispose of;
+ * but I'm a little nervous about letting this folio out of
+ * shmem_writeout() in a hybrid half-tmpfs-half-swap state
+ * e.g. folio_mapping(folio) might give an unexpected answer.
+ */
+ swap_cache_del_folio(folio);
+ goto redirty;
+ }
+ if (nr_pages > 1)
+ goto try_split;
redirty:
folio_mark_dirty(folio);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
- folio_unlock(folio);
- return 0;
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -1684,23 +1796,17 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
{
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
- unsigned long vm_flags = vma ? vma->vm_flags : 0;
- bool global_huge;
- loff_t i_size;
- int order;
+ vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
+ unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
- global_huge = shmem_huge_global_enabled(inode, index, write_end,
- shmem_huge_force, vm_flags);
- if (!vma || !vma_is_anon_shmem(vma)) {
- /*
- * For tmpfs, we now only support PMD sized THP if huge page
- * is enabled, otherwise fallback to order 0.
- */
- return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
- }
+ global_orders = shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
+ /* Tmpfs huge pages allocation */
+ if (!vma || !vma_is_anon_shmem(vma))
+ return global_orders;
/*
* Following the 'deny' semantics of the top level, force the huge
@@ -1717,22 +1823,12 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
return READ_ONCE(huge_shmem_orders_inherit);
/* Allow mTHP that will be fully within i_size. */
- order = highest_order(within_size_orders);
- while (within_size_orders) {
- index = round_up(index + 1, 1 << order);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >> PAGE_SHIFT >= index) {
- mask |= within_size_orders;
- break;
- }
-
- order = next_order(&within_size_orders, order);
- }
+ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0);
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_shmem_orders_madvise);
- if (global_huge)
+ if (global_orders > 0)
mask |= READ_ONCE(huge_shmem_orders_inherit);
return THP_ORDERS_ALL_FILE_DEFAULT & mask;
@@ -1805,6 +1901,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
+ pgoff_t aligned_index;
long pages;
int error, order;
@@ -1818,10 +1915,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
order = highest_order(suitable_orders);
while (suitable_orders) {
pages = 1UL << order;
- index = round_down(index, pages);
- folio = shmem_alloc_folio(gfp, order, info, index);
- if (folio)
+ aligned_index = round_down(index, pages);
+ folio = shmem_alloc_folio(gfp, order, info, aligned_index);
+ if (folio) {
+ index = aligned_index;
goto allocated;
+ }
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
@@ -1898,6 +1997,93 @@ unlock:
return ERR_PTR(error);
}
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+ struct vm_area_struct *vma, pgoff_t index,
+ swp_entry_t entry, int order, gfp_t gfp)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ int nr_pages = 1 << order;
+ struct folio *new;
+ gfp_t alloc_gfp;
+ void *shadow;
+
+ /*
+ * We have arrived here because our zones are constrained, so don't
+ * limit chance of success with further cpuset and node constraints.
+ */
+ gfp &= ~GFP_CONSTRAINT_MASK;
+ alloc_gfp = gfp;
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (WARN_ON_ONCE(order))
+ return ERR_PTR(-EINVAL);
+ } else if (order) {
+ /*
+ * If uffd is active for the vma, we need per-page fault
+ * fidelity to maintain the uffd semantics, so fall back
+ * to order-0 swapin; the same applies to the zswap case.
+ * Any existing sub folio in the swap cache also blocks
+ * mTHP swapin.
+ */
+ if ((vma && unlikely(userfaultfd_armed(vma))) ||
+ !zswap_never_enabled() ||
+ non_swapcache_batch(entry, nr_pages) != nr_pages)
+ goto fallback;
+
+ alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp);
+ }
+retry:
+ new = shmem_alloc_folio(alloc_gfp, order, info, index);
+ if (!new) {
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
+ }
+
+ if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+ alloc_gfp, entry)) {
+ folio_put(new);
+ new = ERR_PTR(-ENOMEM);
+ goto fallback;
+ }
+
+ /*
+ * Prevent parallel swapin from proceeding with the swap cache flag.
+ *
+ * Of course there is another possible concurrent scenario as well,
+ * that is to say, the swap cache flag of a large folio has already
+ * been set by swapcache_prepare(), while another thread may have
+ * already split the large swap entry stored in the shmem mapping.
+ * In this case, shmem_add_to_page_cache() will help identify the
+ * concurrent swapin and return -EEXIST.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ folio_put(new);
+ new = ERR_PTR(-EEXIST);
+ /* Try smaller folio to avoid cache conflict */
+ goto fallback;
+ }
+
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ new->swap = entry;
+
+ memcg1_swapin(entry, nr_pages);
+ shadow = swap_cache_get_shadow(entry);
+ if (shadow)
+ workingset_refault(new, shadow);
+ folio_add_lru(new);
+ swap_read_folio(new, NULL);
+ return new;
+fallback:
+ /* Order 0 swapin failed, nothing to fallback to, abort */
+ if (!order)
+ return new;
+ entry.val += index - round_down(index, nr_pages);
+ alloc_gfp = gfp;
+ nr_pages = 1;
+ order = 0;
+ goto retry;
+}
+
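As a concrete illustration of the order-0 fallback at the end of this function: if a fault at index 515 hits an order-9 swap entry, nr_pages is 512, so entry.val is advanced by 515 - round_down(515, 512) = 3, selecting the exact order-0 swap entry backing the faulting index before the retry.
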
/*
* When a page is moved from swapcache to shmem filecache (either by the
* usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
@@ -1919,13 +2105,11 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index,
struct vm_area_struct *vma)
{
+ struct swap_cluster_info *ci;
struct folio *new, *old = *foliop;
swp_entry_t entry = old->swap;
- struct address_space *swap_mapping = swap_address_space(entry);
- pgoff_t swap_index = swap_cache_index(entry);
- XA_STATE(xas, &swap_mapping->i_pages, swap_index);
int nr_pages = folio_nr_pages(old);
- int error = 0, i;
+ int error = 0;
/*
* We have arrived here because our zones are constrained, so don't
@@ -1954,40 +2138,15 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
new->swap = entry;
folio_set_swapcache(new);
- /* Swap cache still stores N entries instead of a high-order entry */
- xa_lock_irq(&swap_mapping->i_pages);
- for (i = 0; i < nr_pages; i++) {
- void *item = xas_load(&xas);
+ ci = swap_cluster_get_and_lock_irq(old);
+ __swap_cache_replace_folio(ci, old, new);
+ mem_cgroup_replace_folio(old, new);
+ shmem_update_stats(new, nr_pages);
+ shmem_update_stats(old, -nr_pages);
+ swap_cluster_unlock_irq(ci);
- if (item != old) {
- error = -ENOENT;
- break;
- }
-
- xas_store(&xas, new);
- xas_next(&xas);
- }
- if (!error) {
- mem_cgroup_replace_folio(old, new);
- __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
- __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
- __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
- __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
- }
- xa_unlock_irq(&swap_mapping->i_pages);
-
- if (unlikely(error)) {
- /*
- * Is this possible? I think not, now that our callers
- * check both the swapcache flag and folio->private
- * after getting the folio lock; but be defensive.
- * Reverse old to newpage for clear and free.
- */
- old = new;
- } else {
- folio_add_lru(new);
- *foliop = new;
- }
+ folio_add_lru(new);
+ *foliop = new;
folio_clear_swapcache(old);
old->private = NULL;
@@ -2003,7 +2162,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
- struct folio *folio, swp_entry_t swap)
+ struct folio *folio, swp_entry_t swap,
+ bool skip_swapcache)
{
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
@@ -2019,7 +2179,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
- delete_from_swap_cache(folio);
+ if (!skip_swapcache)
+ swap_cache_del_folio(folio);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2034,15 +2195,16 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
- void *alloced_shadow = NULL;
- int alloced_order = 0, i;
+ int split_order = 0;
+ int i;
/* Convert user data gfp flags to xarray node gfp flags */
gfp &= GFP_RECLAIM_MASK;
for (;;) {
- int order = -1, split_order = 0;
void *old = NULL;
+ int cur_order;
+ pgoff_t swap_index;
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -2051,60 +2213,53 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
goto unlock;
}
- order = xas_get_order(&xas);
-
- /* Swap entry may have changed before we re-acquire the lock */
- if (alloced_order &&
- (old != alloced_shadow || order != alloced_order)) {
- xas_destroy(&xas);
- alloced_order = 0;
- }
+ cur_order = xas_get_order(&xas);
+ if (!cur_order)
+ goto unlock;
/* Try to split large swap entry in pagecache */
- if (order > 0) {
- if (!alloced_order) {
- split_order = order;
+ swap_index = round_down(index, 1 << cur_order);
+ split_order = xas_try_split_min_order(cur_order);
+
+ while (cur_order > 0) {
+ pgoff_t aligned_index =
+ round_down(index, 1 << cur_order);
+ pgoff_t swap_offset = aligned_index - swap_index;
+
+ xas_set_order(&xas, index, split_order);
+ xas_try_split(&xas, old, cur_order);
+ if (xas_error(&xas))
goto unlock;
- }
- xas_split(&xas, old, order);
/*
* Re-set the swap entry after splitting, and the swap
* offset of the original large entry must be continuous.
*/
- for (i = 0; i < 1 << order; i++) {
- pgoff_t aligned_index = round_down(index, 1 << order);
+ for (i = 0; i < 1 << cur_order;
+ i += (1 << split_order)) {
swp_entry_t tmp;
- tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+ tmp = swp_entry(swp_type(swap),
+ swp_offset(swap) + swap_offset +
+ i);
__xa_store(&mapping->i_pages, aligned_index + i,
swp_to_radix_entry(tmp), 0);
}
+ cur_order = split_order;
+ split_order = xas_try_split_min_order(split_order);
}
unlock:
xas_unlock_irq(&xas);
- /* split needed, alloc here and retry. */
- if (split_order) {
- xas_split_alloc(&xas, old, split_order, gfp);
- if (xas_error(&xas))
- goto error;
- alloced_shadow = old;
- alloced_order = split_order;
- xas_reset(&xas);
- continue;
- }
-
if (!xas_nomem(&xas, gfp))
break;
}
-error:
if (xas_error(&xas))
return xas_error(&xas);
- return alloced_order;
+ return 0;
}
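As a rough worked example (assuming XA_CHUNK_SHIFT == 6): splitting an order-9 entry takes two passes of the loop above, first into order-6 entries across the whole order-9 range, then the order-6 entry covering the faulting index down to order 0, with the inner loop re-storing swap entries whose offsets remain continuous with the original large entry at every step.
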
/*
@@ -2121,73 +2276,112 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swap;
+ softleaf_t index_entry;
struct swap_info_struct *si;
struct folio *folio = NULL;
- swp_entry_t swap;
- int error, nr_pages;
+ bool skip_swapcache = false;
+ int error, nr_pages, order;
+ pgoff_t offset;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
- swap = radix_to_swp_entry(*foliop);
+ index_entry = radix_to_swp_entry(*foliop);
+ swap = index_entry;
*foliop = NULL;
- if (is_poisoned_swp_entry(swap))
+ if (softleaf_is_poison_marker(index_entry))
return -EIO;
- si = get_swap_device(swap);
- if (!si) {
- if (!shmem_confirm_swap(mapping, index, swap))
+ si = get_swap_device(index_entry);
+ order = shmem_confirm_swap(mapping, index, index_entry);
+ if (unlikely(!si)) {
+ if (order < 0)
return -EEXIST;
else
return -EINVAL;
}
+ if (unlikely(order < 0)) {
+ put_swap_device(si);
+ return -EEXIST;
+ }
+
+ /* index may point to the middle of a large entry, get the sub entry */
+ if (order) {
+ offset = index - round_down(index, 1 << order);
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
/* Look it up and read it in.. */
- folio = swap_cache_get_folio(swap, NULL, 0);
+ folio = swap_cache_get_folio(swap);
if (!folio) {
- int split_order;
-
- /* Or update major stats only when swapin succeeds?? */
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+ /* Direct swapin skipping swap cache & readahead */
+ folio = shmem_swap_alloc_folio(inode, vma, index,
+ index_entry, order, gfp);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
+ folio = NULL;
+ goto failed;
+ }
+ skip_swapcache = true;
+ } else {
+ /* Cached swapin only supports order 0 folio */
+ folio = shmem_swapin_cluster(swap, gfp, info, index);
+ if (!folio) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+ } else {
+ swap_update_readahead(folio, NULL, 0);
+ }
+ if (order > folio_order(folio)) {
/*
- * Now swap device can only swap in order 0 folio, then we
- * should split the large swap entry stored in the pagecache
- * if necessary.
- */
- split_order = shmem_split_large_entry(inode, index, swap, gfp);
- if (split_order < 0) {
- error = split_order;
- goto failed;
- }
-
- /*
- * If the large swap entry has already been split, it is
- * necessary to recalculate the new swap entry based on
- * the old order alignment.
+ * Swapin may get smaller folios due to various reasons:
+ * It may fall back to order 0 due to memory pressure or a race,
+ * swap readahead may swap in order 0 folios into swapcache
+ * asynchronously, while the shmem mapping can still store
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
*/
- if (split_order > 0) {
- pgoff_t offset = index - round_down(index, 1 << split_order);
-
- swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
- }
+ error = shmem_split_large_entry(inode, index, index_entry, gfp);
+ if (error)
+ goto failed_nolock;
+ }
- /* Here we actually start the io */
- folio = shmem_swapin_cluster(swap, gfp, info, index);
- if (!folio) {
- error = -ENOMEM;
- goto failed;
- }
+ /*
+ * If the folio is large, round down swap and index by folio size.
+ * No matter what race occurs, the swap layer ensures we either get
+ * a valid folio that has its swap entry aligned by size, or a
+ * temporarily invalid one which we'll abort very soon and retry.
+ *
+ * shmem_add_to_page_cache ensures the whole range contains expected
+ * entries and prevents any corruption, so any race split is fine
+ * too; it will succeed as long as the entries are still there.
+ */
+ nr_pages = folio_nr_pages(folio);
+ if (nr_pages > 1) {
+ swap.val = round_down(swap.val, nr_pages);
+ index = round_down(index, nr_pages);
}
- /* We have to do this with folio locked to prevent races */
+ /*
+ * We have to do this with the folio locked to prevent races.
+ * The shmem_confirm_swap below only checks if the first swap
+ * entry matches the folio; that's enough to ensure the folio
+ * is not used outside of shmem, as shmem swap entries
+ * and swap cache folios are never partially freed.
+ */
folio_lock(folio);
- if (!folio_test_swapcache(folio) ||
- folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
+ shmem_confirm_swap(mapping, index, swap) < 0 ||
+ folio->swap.val != swap.val) {
error = -EEXIST;
goto unlock;
}
@@ -2196,7 +2390,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
- nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -2210,8 +2403,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping,
- round_down(index, nr_pages),
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
@@ -2221,7 +2413,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(folio);
+ if (skip_swapcache) {
+ folio->swap.val = 0;
+ swapcache_clear(si, swap, nr_pages);
+ } else {
+ swap_cache_del_folio(folio);
+ }
folio_mark_dirty(folio);
swap_free_nr(swap, nr_pages);
put_swap_device(si);
@@ -2229,15 +2426,19 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
*foliop = folio;
return 0;
failed:
- if (!shmem_confirm_swap(mapping, index, swap))
+ if (shmem_confirm_swap(mapping, index, swap) < 0)
error = -EEXIST;
if (error == -EIO)
- shmem_set_folio_swapin_error(inode, index, folio, swap);
+ shmem_set_folio_swapin_error(inode, index, folio, swap,
+ skip_swapcache);
unlock:
- if (folio) {
+ if (folio)
folio_unlock(folio);
+failed_nolock:
+ if (skip_swapcache)
+ swapcache_clear(si, folio->swap, folio_nr_pages(folio));
+ if (folio)
folio_put(folio);
- }
put_swap_device(si);
return error;
@@ -2578,8 +2779,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (len > TASK_SIZE)
return -ENOMEM;
- addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
- flags);
+ addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags);
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
@@ -2657,8 +2857,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (inflated_len < len)
return addr;
- inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
- inflated_len, 0, flags);
+ inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
@@ -2729,15 +2928,15 @@ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
* ipc_lock_object() when called from shmctl_do_lock(),
* no serialization needed when called from shm_destroy().
*/
- if (lock && !(info->flags & VM_LOCKED)) {
+ if (lock && !(info->flags & SHMEM_F_LOCKED)) {
if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
- info->flags |= VM_LOCKED;
+ info->flags |= SHMEM_F_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) {
user_shm_unlock(inode->i_size, ucounts);
- info->flags &= ~VM_LOCKED;
+ info->flags &= ~SHMEM_F_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
retval = 0;
@@ -2746,22 +2945,17 @@ out_nomem:
return retval;
}
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+static int shmem_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
- struct shmem_inode_info *info = SHMEM_I(inode);
- int ret;
-
- ret = seal_check_write(info->seals, vma);
- if (ret)
- return ret;
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
if (inode->i_nlink)
- vma->vm_ops = &shmem_vm_ops;
+ desc->vm_ops = &shmem_vm_ops;
else
- vma->vm_ops = &shmem_anon_vm_ops;
+ desc->vm_ops = &shmem_anon_vm_ops;
return 0;
}
@@ -2887,7 +3081,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
- info->flags = flags & VM_NORESERVE;
+ info->flags = (flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
info->i_crtime = inode_get_mtime(inode);
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
@@ -3095,9 +3289,9 @@ static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
static int
-shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3114,12 +3308,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
+ if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) &&
+ pos + len > inode->i_size))
+ return -EPERM;
+
ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
if (ret)
return ret;
- if (folio_test_hwpoison(folio) ||
- (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
+ if (folio_contain_hwpoisoned_page(folio)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
@@ -3130,9 +3327,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
}
static int
-shmem_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -3326,7 +3523,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3353,7 +3550,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3440,7 +3637,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
@@ -3488,6 +3685,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
inode_lock(inode);
+ if (info->flags & SHMEM_F_MAPPING_FROZEN) {
+ error = -EPERM;
+ goto out;
+ }
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
@@ -3597,7 +3799,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -3687,12 +3889,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
-
- dget(dentry); /* Extra count - pin the dentry in core */
+ d_make_persistent(dentry, inode);
return error;
out_iput:
@@ -3728,16 +3925,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -3753,7 +3950,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret = 0;
+ int ret;
/*
* No ordinary (disk based) filesystem counts links as inodes;
@@ -3765,29 +3962,19 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink) {
ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
- goto out;
+ return ret;
}
ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
if (ret) {
if (inode->i_nlink)
shmem_free_inode(inode->i_sb, 0);
- goto out;
+ return ret;
}
dir->i_size += BOGO_DIRENT_SIZE;
- inode_set_mtime_to_ts(dir,
- inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
- inc_nlink(inode);
- ihold(inode); /* New dentry reference */
- dget(dentry); /* Extra pinning count for the created dentry */
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
-out:
- return ret;
+ return simple_link(old_dentry, dir, dentry);
}
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
@@ -3800,11 +3987,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode_set_mtime_to_ts(dir,
- inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
inode_inc_iversion(dir);
- drop_nlink(inode);
- dput(dentry); /* Undo the count from "create" - does all the work */
+ simple_unlink(dir, dentry);
/*
* For now, VFS can't deal with case-insensitive negative dentries, so
@@ -3818,7 +4002,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
- if (!simple_offset_empty(dentry))
+ if (!simple_empty(dentry))
return -ENOTEMPTY;
drop_nlink(d_inode(dentry));
@@ -3875,7 +4059,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
return simple_offset_rename_exchange(old_dir, old_dentry,
new_dir, new_dentry);
- if (!simple_offset_empty(new_dentry))
+ if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
@@ -3914,6 +4098,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
int len;
struct inode *inode;
struct folio *folio;
+ char *link;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3935,12 +4120,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- inode->i_link = kmemdup(symname, len, GFP_KERNEL);
- if (!inode->i_link) {
+ link = kmemdup(symname, len, GFP_KERNEL);
+ if (!link) {
error = -ENOMEM;
goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
+ inode_set_cached_link(inode, link, len - 1);
} else {
inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops;
@@ -3957,11 +4143,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
- d_add(dentry, inode);
- else
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
return 0;
out_remove_offset:
@@ -4011,7 +4193,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
#ifdef CONFIG_TMPFS_XATTR
-static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
@@ -4021,7 +4203,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int shmem_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -4365,7 +4547,7 @@ static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *
bool latest_version)
{
struct shmem_options *ctx = fc->fs_private;
- unsigned int version = UTF8_LATEST;
+ int version = UTF8_LATEST;
struct unicode_map *encoding;
char *version_str = param->string + 5;
@@ -4504,7 +4686,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Turning off swap in unprivileged tmpfs mounts unsupported");
}
ctx->noswap = true;
- ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
case Opt_quota:
if (fc->user_ns != &init_user_ns)
@@ -4580,48 +4761,37 @@ bad_value:
return invalfc(fc, "Bad value for '%s'", param->key);
}
-static int shmem_parse_options(struct fs_context *fc, void *data)
+static char *shmem_next_opt(char **s)
{
- char *options = data;
+ char *sbegin = *s;
+ char *p;
- if (options) {
- int err = security_sb_eat_lsm_opts(options, &fc->security);
- if (err)
- return err;
- }
+ if (sbegin == NULL)
+ return NULL;
- while (options != NULL) {
- char *this_char = options;
- for (;;) {
- /*
- * NUL-terminate this option: unfortunately,
- * mount options form a comma-separated list,
- * but mpol's nodelist may also contain commas.
- */
- options = strchr(options, ',');
- if (options == NULL)
- break;
- options++;
- if (!isdigit(*options)) {
- options[-1] = '\0';
- break;
- }
- }
- if (*this_char) {
- char *value = strchr(this_char, '=');
- size_t len = 0;
- int err;
-
- if (value) {
- *value++ = '\0';
- len = strlen(value);
- }
- err = vfs_parse_fs_string(fc, this_char, value, len);
- if (err < 0)
- return err;
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ for (;;) {
+ p = strchr(*s, ',');
+ if (p == NULL)
+ break;
+ *s = p + 1;
+ if (!isdigit(*(p+1))) {
+ *p = '\0';
+ return sbegin;
}
}
- return 0;
+
+ *s = NULL;
+ return sbegin;
+}
+
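The digit-after-comma rule is the subtle part: a comma followed by a digit is assumed to belong to an mpol nodelist rather than to separate options. A self-contained userspace sketch of the same tokenizer (next_opt is a stand-in name for illustration):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static char *next_opt(char **s)
{
	char *sbegin = *s, *p;

	if (sbegin == NULL)
		return NULL;
	for (;;) {
		p = strchr(*s, ',');
		if (p == NULL)
			break;
		*s = p + 1;
		/* A digit after the comma means it is part of a nodelist. */
		if (!isdigit((unsigned char)*(p + 1))) {
			*p = '\0';
			return sbegin;
		}
	}
	*s = NULL;
	return sbegin;
}

int main(void)
{
	char opts[] = "mpol=bind:0,2,size=1g,huge=always";
	char *s = opts, *opt;

	/* Prints "mpol=bind:0,2", then "size=1g", then "huge=always". */
	while ((opt = next_opt(&s)))
		printf("option: %s\n", opt);
	return 0;
}
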
+static int shmem_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}
/*
@@ -4665,14 +4835,15 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
- if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+
+ /*
+ * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
+ * counterpart for (re-)enabling swap.
+ */
+ if (ctx->noswap && !sbinfo->noswap) {
err = "Cannot disable swap on remount";
goto out;
}
- if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
- err = "Cannot enable swap on remount if it was disabled on first mount";
- goto out;
- }
if (ctx->seen & SHMEM_SEEN_QUOTA &&
!sb_any_quota_loaded(fc->root->d_sb)) {
@@ -4819,7 +4990,6 @@ static void shmem_put_super(struct super_block *sb)
static const struct dentry_operations shmem_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
- .d_delete = always_delete_dentry,
};
#endif
@@ -4856,7 +5026,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC;
#if IS_ENABLED(CONFIG_UNICODE)
if (!ctx->encoding && ctx->strict_encoding) {
@@ -4867,7 +5037,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
if (ctx->encoding) {
sb->s_encoding = ctx->encoding;
- sb->s_d_op = &shmem_ci_dentry_ops;
+ set_default_d_op(sb, &shmem_ci_dentry_ops);
if (ctx->strict_encoding)
sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
}
@@ -4876,6 +5046,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#else
sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -4888,7 +5059,12 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sbinfo->gid = ctx->gid;
sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
- sbinfo->huge = ctx->huge;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ else
+ sbinfo->huge = tmpfs_huge;
+#endif
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
@@ -4966,7 +5142,7 @@ static const struct fs_context_operations shmem_fs_context_ops = {
.free = shmem_free_fc,
.get_tree = shmem_get_tree,
#ifdef CONFIG_TMPFS
- .parse_monolithic = shmem_parse_options,
+ .parse_monolithic = shmem_parse_monolithic,
.parse_param = shmem_parse_one,
.reconfigure = shmem_reconfigure,
#endif
@@ -5024,7 +5200,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
@@ -5037,7 +5212,7 @@ static const struct address_space_operations shmem_aops = {
};
static const struct file_operations shmem_file_operations = {
- .mmap = shmem_mmap,
+ .mmap_prepare = shmem_mmap_prepare,
.open = shmem_file_open,
.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
@@ -5111,7 +5286,7 @@ static const struct super_operations shmem_ops = {
.get_dquots = shmem_get_dquots,
#endif
.evict_inode = shmem_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
@@ -5155,6 +5330,9 @@ int shmem_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
+#ifdef CONFIG_TMPFS
+ fc->sb_flags |= SB_I_VERSION;
+#endif
return 0;
}
@@ -5165,7 +5343,7 @@ static struct file_system_type shmem_fs_type = {
#ifdef CONFIG_TMPFS
.parameters = shmem_fs_parameters,
#endif
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};
@@ -5439,6 +5617,21 @@ static int __init setup_transparent_hugepage_shmem(char *str)
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);
+static int __init setup_transparent_hugepage_tmpfs(char *str)
+{
+ int huge;
+
+ huge = shmem_parse_huge(str);
+ if (huge < 0) {
+ pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
+ return huge;
+ }
+
+ tmpfs_huge = huge;
+ return 1;
+}
+__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);
+
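For example, booting with transparent_hugepage_tmpfs=within_size changes the default huge policy for every tmpfs mount that does not pass an explicit huge= option, via the sbinfo->huge fallback to tmpfs_huge in shmem_fill_super() above.
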
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_shmem(char *str)
{
@@ -5479,19 +5672,19 @@ static int __init setup_thp_shmem(char *str)
THP_ORDERS_ALL_FILE_DEFAULT);
}
- if (start == -EINVAL) {
+ if (start < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
start_size);
goto err;
}
- if (end == -EINVAL) {
+ if (end < 0) {
pr_err("invalid size %s in thp_shmem boot parameter\n",
end_size);
goto err;
}
- if (start < 0 || end < 0 || start > end)
+ if (start > end)
goto err;
nr = end - start + 1;
@@ -5588,11 +5781,11 @@ unsigned long shmem_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
- return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
#endif
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
@@ -5617,8 +5810,10 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
/* common code */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
- loff_t size, unsigned long flags, unsigned int i_flags)
+ loff_t size, unsigned long vm_flags,
+ unsigned int i_flags)
{
+ unsigned long flags = (vm_flags & VM_NORESERVE) ? SHMEM_F_NORESERVE : 0;
struct inode *inode;
struct file *res;
@@ -5628,14 +5823,14 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
- if (shmem_acct_size(flags, size))
- return ERR_PTR(-ENOMEM);
-
if (is_idmapped_mnt(mnt))
return ERR_PTR(-EINVAL);
+ if (shmem_acct_size(flags, size))
+ return ERR_PTR(-ENOMEM);
+
inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
- S_IFREG | S_IRWXUGO, 0, flags);
+ S_IFREG | S_IRWXUGO, 0, vm_flags);
if (IS_ERR(inode)) {
shmem_unacct_size(flags, size);
return ERR_CAST(inode);
@@ -5658,7 +5853,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
* underlying inode. So users of this interface must do LSM checks at a
* higher layer. The users are the big_key and shm implementations. LSM
* checks are provided at the key or shm level rather than the inode.
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5670,7 +5865,7 @@ EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5683,7 +5878,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
* shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
* @mnt: the tmpfs mount where the file will be created
- * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @name: name for dentry (to be seen in /proc/<pid>/maps)
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
*/
@@ -5694,14 +5889,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
-/**
- * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
+static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
{
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start;
+ loff_t size = end - start;
/*
* Cloning a new file under mmap_lock leads to a lock ordering conflict
@@ -5709,7 +5899,18 @@ int shmem_zero_setup(struct vm_area_struct *vma)
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
- file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
+ return shmem_kernel_file_setup("dev/zero", size, vm_flags);
+}
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+ struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
+
if (IS_ERR(file))
return PTR_ERR(file);
@@ -5722,6 +5923,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
}
/**
+ * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
+ * descriptor for convenience.
+ * @desc: Describes VMA
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup_desc(struct vm_area_desc *desc)
+{
+ struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ desc->vm_file = file;
+ desc->vm_ops = &shmem_anon_vm_ops;
+
+ return 0;
+}
+
+/**
* shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
* @mapping: the folio's address_space
* @index: the folio index
@@ -5744,8 +5964,8 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
- gfp, NULL, NULL);
+ error = shmem_get_folio_gfp(inode, index, i_size_read(inode),
+ &folio, SGP_CACHE, gfp, NULL, NULL);
if (error)
return ERR_PTR(error);