summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2019-04-18 17:50:13 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-04-19 09:46:04 -0700
commitaf53d3e9e04024885de5b4fda51e5fa362ae2bd8 (patch)
treeab2a56b0bafc76d99dcd2924f22f0fed76f4ae2e /mm
parent64165b1affc5bc16231ac971e66aae7d68d57f2c (diff)
mm: swapoff: shmem_unuse() stop eviction without igrab()
The igrab() in shmem_unuse() looks good, but we forgot that it gives no protection against concurrent unmounting: a point made by Konstantin Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893ae78 ("tmpfs: fix race between umount and swapoff"). The current 5.1-rc swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs. Self-destruct in 5 seconds. Have a nice day..." followed by GPF. Once again, give up on using igrab(); but don't go back to making such heavy-handed use of shmem_swaplist_mutex as last time: that would spoil the new design, and I expect could deadlock inside shmem_swapin_page(). Instead, shmem_unuse() just raise a "stop_eviction" count in the shmem- specific inode, and shmem_evict_inode() wait for that to go down to 0. Call it "stop_eviction" rather than "swapoff_busy" because it can be put to use for others later (huge tmpfs patches expect to use it). That simplifies shmem_unuse(), protecting it from both unlink and unmount; and in practice lets it locate all the swap in its first try. But do not rely on that: there's still a theoretical case, when shmem_writepage() might have been preempted after its get_swap_page(), before making the swap entry visible to swapoff. [hughd@google.com: remove incorrect list_del()] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils Fixes: b56a2d8af914 ("mm: rid swapoff of quadratic complexity") Signed-off-by: Hugh Dickins <hughd@google.com> Cc: "Alex Xu (Hello71)" <alex_y_xu@yahoo.ca> Cc: Huang Ying <ying.huang@intel.com> Cc: Kelley Nielsen <kelleynnn@gmail.com> Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> Cc: Rik van Riel <riel@surriel.com> Cc: Vineeth Pillai <vpillai@digitalocean.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/shmem.c40
-rw-r--r--mm/swapfile.c11
2 files changed, 23 insertions, 28 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 859e8628071f..2275a0ff7c30 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode)
}
spin_unlock(&sbinfo->shrinklist_lock);
}
- if (!list_empty(&info->swaplist)) {
+ while (!list_empty(&info->swaplist)) {
+ /* Wait while shmem_unuse() is scanning this inode... */
+ wait_var_event(&info->stop_eviction,
+ !atomic_read(&info->stop_eviction));
mutex_lock(&shmem_swaplist_mutex);
- list_del_init(&info->swaplist);
+ /* ...but beware of the race if we peeked too early */
+ if (!atomic_read(&info->stop_eviction))
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
}
@@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
{
struct shmem_inode_info *info, *next;
- struct inode *inode;
- struct inode *prev_inode = NULL;
int error = 0;
if (list_empty(&shmem_swaplist))
return 0;
mutex_lock(&shmem_swaplist_mutex);
-
- /*
- * The extra refcount on the inode is necessary to safely dereference
- * p->next after re-acquiring the lock. New shmem inodes with swap
- * get added to the end of the list and we will scan them all.
- */
list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
if (!info->swapped) {
list_del_init(&info->swaplist);
continue;
}
-
- inode = igrab(&info->vfs_inode);
- if (!inode)
- continue;
-
+ /*
+ * Drop the swaplist mutex while searching the inode for swap;
+ * but before doing so, make sure shmem_evict_inode() will not
+ * remove placeholder inode from swaplist, nor let it be freed
+ * (igrab() would protect from unlink, but not from unmount).
+ */
+ atomic_inc(&info->stop_eviction);
mutex_unlock(&shmem_swaplist_mutex);
- if (prev_inode)
- iput(prev_inode);
- prev_inode = inode;
- error = shmem_unuse_inode(inode, type, frontswap,
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
fs_pages_to_unuse);
cond_resched();
@@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap,
next = list_next_entry(info, swaplist);
if (!info->swapped)
list_del_init(&info->swaplist);
+ if (atomic_dec_and_test(&info->stop_eviction))
+ wake_up_var(&info->stop_eviction);
if (error)
break;
}
mutex_unlock(&shmem_swaplist_mutex);
- if (prev_inode)
- iput(prev_inode);
-
return error;
}
@@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->shrinklist);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71383625a582..cf63b5f01adf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2116,12 +2116,11 @@ retry:
* Under global memory pressure, swap entries can be reinserted back
* into process space after the mmlist loop above passes over them.
*
- * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
- * a shmem inode using swap is being evicted; and when mmget_not_zero()
- * above fails, that mm is likely to be freeing swap from exit_mmap().
- * Both proceed at their own independent pace: we could move them to
- * separate lists, and wait for those lists to be emptied; but it's
- * easier and more robust (though cpu-intensive) just to keep retrying.
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+ * at its own independent pace; and even shmem_writepage() could have
+ * been preempted after get_swap_page(), temporarily hiding that swap.
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
*/
if (si->inuse_pages) {
if (!signal_pending(current))