Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	234
1 file changed, 96 insertions(+), 138 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1b4b9d80e3b..46d2008e4b99 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -44,7 +44,7 @@
#include <linux/plist.h>
#include <asm/tlbflush.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/swap_cgroup.h>
#include "swap_table.h"
#include "internal.h"
@@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority = -1;
+#define DEF_SWAP_PRIO -1
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
@@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -236,11 +236,10 @@ again:
ret = -nr_pages;
/*
- * When this function is called from scan_swap_map_slots() and it's
- * called by vmscan.c at reclaiming folios. So we hold a folio lock
- * here. We have to use trylock for avoiding deadlock. This is a special
- * case and you should use folio_free_swap() with explicit folio_lock()
- * in usual operations.
+ * We hold a folio lock here and have to use trylock to
+ * avoid deadlock. This is a special case and you should
+ * use folio_free_swap() with explicit folio_lock() in usual
+ * operations.
*/
if (!folio_trylock(folio))
goto out;
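
The comment above describes the classic trylock-and-back-off pattern. A minimal userspace analogy, assuming nothing about the kernel's folio lock: pthreads stands in for it, and every name below is invented for illustration.

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the folio lock; the name is invented for illustration. */
static pthread_mutex_t folio_lock = PTHREAD_MUTEX_INITIALIZER;

static int try_free_swap(void)
{
	/* Non-blocking attempt: back off instead of risking deadlock. */
	if (pthread_mutex_trylock(&folio_lock) != 0)
		return 0;
	/* ... work that must run under the lock ... */
	pthread_mutex_unlock(&folio_lock);
	return 1;
}

int main(void)
{
	/* Build with -pthread; prints "freed: 1" when the lock is free. */
	printf("freed: %d\n", try_free_swap());
	return 0;
}
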
@@ -594,7 +593,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
* this returns NULL for a non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list, int order)
+ struct swap_info_struct *si, struct list_head *list)
{
struct swap_cluster_info *ci, *found = NULL;
@@ -751,14 +750,14 @@ static void relocate_cluster(struct swap_info_struct *si,
}
/*
- * The cluster corresponding to page_nr will be used. The cluster will not be
- * added to free cluster list and its usage counter will be increased by 1.
- * Only used for initialization.
+ * The cluster corresponding to @offset will be accounted as having one bad
+ * slot. The cluster will not be added to the free cluster list, and its
+ * usage counter will be increased by 1. Only used for initialization.
*/
-static int inc_cluster_info_page(struct swap_info_struct *si,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
+ unsigned long offset)
{
- unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ unsigned long idx = offset / SWAPFILE_CLUSTER;
struct swap_table *table;
struct swap_cluster_info *ci;
@@ -772,8 +771,8 @@ static int inc_cluster_info_page(struct swap_info_struct *si,
ci->count++;
- VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
- VM_BUG_ON(ci->flags);
+ WARN_ON(ci->count > SWAPFILE_CLUSTER);
+ WARN_ON(ci->flags);
return 0;
}
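
A rough, userspace-only sketch of the accounting that swap_cluster_setup_bad_slot() performs, assuming a 512-slot cluster; the structure and names below are illustrative stand-ins, not the kernel's definitions. A bad slot simply bumps the usage count of its containing cluster, so that cluster can never be handed out as free.

#include <assert.h>
#include <stdio.h>

#define SLOTS_PER_CLUSTER 512	/* stand-in for SWAPFILE_CLUSTER */

struct cluster { unsigned int count; };

/* A bad slot is counted as permanently "in use" in its cluster. */
static void mark_bad_slot(struct cluster *clusters, unsigned long offset)
{
	unsigned long idx = offset / SLOTS_PER_CLUSTER;

	clusters[idx].count++;
	assert(clusters[idx].count <= SLOTS_PER_CLUSTER);
}

int main(void)
{
	struct cluster clusters[4] = { { 0 } };

	mark_bad_slot(clusters, 0);	/* e.g. the header page */
	mark_bad_slot(clusters, 1030);	/* a bad page landing in cluster 2 */
	printf("cluster 0: %u used, cluster 2: %u used\n",
	       clusters[0].count, clusters[2].count);
	return 0;
}
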
@@ -957,7 +956,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
unsigned int found = SWAP_ENTRY_INVALID;
do {
- struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
unsigned long offset;
if (!ci)
@@ -982,7 +981,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -1101,13 +1100,6 @@ new_cluster:
goto done;
}
- /*
- * We don't have free cluster but have some clusters in discarding,
- * do discard now and reclaim them.
- */
- if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
- goto new_cluster;
-
if (order)
goto done;
@@ -1137,7 +1129,6 @@ done:
/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
- int nid;
unsigned long pages;
spin_lock(&swap_avail_lock);
@@ -1166,8 +1157,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
goto skip;
}
- for_each_node(nid)
- plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_del(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
@@ -1176,7 +1166,6 @@ skip:
/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
- int nid;
long val;
unsigned long pages;
@@ -1209,8 +1198,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
goto skip;
}
- for_each_node(nid)
- plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_add(&si->avail_list, &swap_avail_head);
skip:
spin_unlock(&swap_avail_lock);
@@ -1350,54 +1338,79 @@ static bool swap_alloc_fast(swp_entry_t *entry,
}
/* Rotate the device and switch to a new cluster */
-static bool swap_alloc_slow(swp_entry_t *entry,
+static void swap_alloc_slow(swp_entry_t *entry,
int order)
{
- int node;
unsigned long offset;
struct swap_info_struct *si, *next;
- node = numa_node_id();
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
/* Rotate the device and switch to a new cluster */
- plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+ plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
put_swap_device(si);
if (offset) {
*entry = swp_entry(si->type, offset);
- return true;
+ return;
}
if (order)
- return false;
+ return;
}
spin_lock(&swap_avail_lock);
/*
* if we got here, it's likely that si was almost full before,
- * and since scan_swap_map_slots() can drop the si->lock,
* multiple callers probably all tried to get a page from the
* same si and it filled up before we could get one; or, the si
- * filled up between us dropping swap_avail_lock and taking
- * si->lock. Since we dropped the swap_avail_lock, the
- * swap_avail_head list may have been modified; so if next is
- * still in the swap_avail_head list then try it, otherwise
- * start over if we have not gotten any slots.
+ * filled up after we dropped swap_avail_lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head list
+ * may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over if we
+ * have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_lists[node]))
+ if (plist_node_empty(&next->avail_list))
goto start_over;
}
spin_unlock(&swap_avail_lock);
+}
+
+/*
+ * Discard pending clusters in a synchronized way when under high pressure.
+ * Return: true if any cluster is discarded.
+ */
+static bool swap_sync_discard(void)
+{
+ bool ret = false;
+ struct swap_info_struct *si, *next;
+
+ spin_lock(&swap_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_active_head, list) {
+ spin_unlock(&swap_lock);
+ if (get_swap_device_info(si)) {
+ if (si->flags & SWP_PAGE_DISCARD)
+ ret = swap_do_scheduled_discard(si);
+ put_swap_device(si);
+ }
+ if (ret)
+ return true;
+
+ spin_lock(&swap_lock);
+ if (plist_node_empty(&next->list))
+ goto start_over;
+ }
+ spin_unlock(&swap_lock);
+
return false;
}
/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
- * @gfp: gfp mask for shadow nodes
*
* Allocate swap space for the folio and add the folio to the
* swap cache.
@@ -1405,7 +1418,7 @@ start_over:
* Context: Caller needs to hold the folio lock.
* Return: Whether the folio was added to the swap cache.
*/
-int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+int folio_alloc_swap(struct folio *folio)
{
unsigned int order = folio_order(folio);
unsigned int size = 1 << order;
@@ -1432,11 +1445,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
}
}
+again:
local_lock(&percpu_swap_cluster.lock);
if (!swap_alloc_fast(&entry, order))
swap_alloc_slow(&entry, order);
local_unlock(&percpu_swap_cluster.lock);
+ if (unlikely(!order && !entry.val)) {
+ if (swap_sync_discard())
+ goto again;
+ }
+
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
if (mem_cgroup_try_charge_swap(folio, entry))
goto out_free;
@@ -1677,7 +1696,7 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si,
/*
* Check if it's the last ref of swap entry in the freeing path.
- * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
+ * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
*/
static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
{
@@ -2239,7 +2258,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct folio *folio;
unsigned long offset;
unsigned char swp_count;
- swp_entry_t entry;
+ softleaf_t entry;
int ret;
pte_t ptent;
@@ -2250,11 +2269,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
ptent = ptep_get_lockless(pte);
+ entry = softleaf_from_pte(ptent);
- if (!is_swap_pte(ptent))
+ if (!softleaf_is_swap(entry))
continue;
-
- entry = pte_to_swp_entry(ptent);
if (swp_type(entry) != type)
continue;
@@ -2682,44 +2700,18 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
-static int swap_node(struct swap_info_struct *si)
-{
- struct block_device *bdev;
-
- if (si->bdev)
- bdev = si->bdev;
- else
- bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
- return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
static void setup_swap_info(struct swap_info_struct *si, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info,
unsigned long *zeromap)
{
- int i;
-
- if (prio >= 0)
- si->prio = prio;
- else
- si->prio = --least_priority;
+ si->prio = prio;
/*
* the plist prio is negated because plist ordering is
* low-to-high, while swap ordering is high-to-low
*/
si->list.prio = -si->prio;
- for_each_node(i) {
- if (si->prio >= 0)
- si->avail_lists[i].prio = -si->prio;
- else {
- if (swap_node(si) == i)
- si->avail_lists[i].prio = 1;
- else
- si->avail_lists[i].prio = -si->prio;
- }
- }
+ si->avail_list.prio = -si->prio;
si->swap_map = swap_map;
si->cluster_info = cluster_info;
si->zeromap = zeromap;
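
The negation kept in the hunk above is easier to see with a worked example: devices should be tried from the highest swap priority down, while a plist iterates in ascending key order. The sketch below uses made-up priority values and qsort() as a stand-in for the plist's ordering.

#include <stdio.h>
#include <stdlib.h>

/* plists iterate in ascending key order, like this comparator. */
static int cmp_key(const void *a, const void *b)
{
	return *(const int *)a - *(const int *)b;
}

int main(void)
{
	int prios[] = { 5, -2, 10 };	/* example swap priorities */
	int keys[3];

	for (int i = 0; i < 3; i++)
		keys[i] = -prios[i];	/* plist key = -prio */
	qsort(keys, 3, sizeof(keys[0]), cmp_key);

	/* Ascending keys visit devices in descending priority: 10, 5, -2. */
	for (int i = 0; i < 3; i++)
		printf("try device with priority %d\n", -keys[i]);
	return 0;
}
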
@@ -2731,16 +2723,7 @@ static void _enable_swap_info(struct swap_info_struct *si)
total_swap_pages += si->pages;
assert_spin_locked(&swap_lock);
- /*
- * both lists are plists, and thus priority ordered.
- * swap_active_head needs to be priority ordered for swapoff(),
- * which on removal of any swap_info_struct with an auto-assigned
- * (i.e. negative) priority increments the auto-assigned priority
- * of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
- * which allocates swap pages from the highest available priority
- * swap_info_struct.
- */
+
plist_add(&si->list, &swap_active_head);
/* Add back to available list */
@@ -2890,20 +2873,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
spin_lock(&p->lock);
del_from_avail_list(p, true);
- if (p->prio < 0) {
- struct swap_info_struct *si = p;
- int nid;
-
- plist_for_each_entry_continue(si, &swap_active_head, list) {
- si->prio++;
- si->list.prio--;
- for_each_node(nid) {
- if (si->avail_lists[nid].prio != 1)
- si->avail_lists[nid].prio--;
- }
- }
- least_priority++;
- }
plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
@@ -2942,7 +2911,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !bdev_nonrot(p->bdev))
+ if (!(p->flags & SWP_SOLIDSTATE))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -3141,9 +3110,8 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
struct swap_info_struct *defer = NULL;
unsigned int type;
- int i;
- p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+ p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -3182,8 +3150,7 @@ static struct swap_info_struct *alloc_swap_info(void)
}
p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
- for_each_node(i)
- plist_node_init(&p->avail_lists[i], 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
if (defer) {
@@ -3236,8 +3203,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
*/
unsigned long generic_max_swapfile_size(void)
{
- return swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+ swp_entry_t entry = swp_entry(0, ~0UL);
+ const pte_t pte = softleaf_to_pte(entry);
+
+ /*
+ * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
+ * we need to decode the type and offset manually.
+ */
+ entry = __pte_to_swp_entry(pte);
+ entry = swp_entry(__swp_type(entry), __swp_offset(entry));
+
+ return swp_offset(entry) + 1;
}
/* Can be overridden by an architecture for additional checks. */
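
The round-trip in the new generic_max_swapfile_size() above can be modelled with a made-up PTE layout; the 50-bit offset field and the bit-8 shift below are assumptions for illustration and match no particular architecture. Encoding an all-ones offset and decoding it back drops the bits the format cannot hold, so the decoded offset plus one is the number of addressable swap slots.

#include <stdint.h>
#include <stdio.h>

#define OFFSET_BITS	50	/* hypothetical width of the PTE offset field */
#define OFFSET_MASK	((1ULL << OFFSET_BITS) - 1)

/* Toy PTE encoding: offset stored at bit 8, truncated to OFFSET_BITS. */
static uint64_t encode_pte(uint64_t offset) { return (offset & OFFSET_MASK) << 8; }
static uint64_t decode_pte(uint64_t pte) { return (pte >> 8) & OFFSET_MASK; }

int main(void)
{
	/* Round-trip the all-ones offset; whatever survives is representable. */
	uint64_t max_offset = decode_pte(encode_pte(~0ULL));

	printf("max swap slots: %llu\n", (unsigned long long)(max_offset + 1));
	return 0;
}
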
@@ -3355,7 +3331,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
- goto err_free;
+ goto err;
for (i = 0; i < SWAP_NR_ORDERS; i++)
si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
spin_lock_init(&si->global_cluster_lock);
@@ -3368,7 +3344,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- err = inc_cluster_info_page(si, cluster_info, 0);
+ err = swap_cluster_setup_bad_slot(cluster_info, 0);
if (err)
goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3376,12 +3352,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
if (page_nr >= maxpages)
continue;
- err = inc_cluster_info_page(si, cluster_info, page_nr);
+ err = swap_cluster_setup_bad_slot(cluster_info, page_nr);
if (err)
goto err;
}
for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
- err = inc_cluster_info_page(si, cluster_info, i);
+ err = swap_cluster_setup_bad_slot(cluster_info, i);
if (err)
goto err;
}
@@ -3408,9 +3384,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
}
return cluster_info;
-err_free:
- free_cluster_info(cluster_info, maxpages);
err:
+ free_cluster_info(cluster_info, maxpages);
return ERR_PTR(err);
}
@@ -3440,9 +3415,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!swap_avail_heads)
- return -ENOMEM;
-
si = alloc_swap_info();
if (IS_ERR(si))
return PTR_ERR(si);
@@ -3619,7 +3591,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
mutex_lock(&swapon_mutex);
- prio = -1;
+ prio = DEF_SWAP_PRIO;
if (swap_flags & SWAP_FLAG_PREFER)
prio = swap_flags & SWAP_FLAG_PRIO_MASK;
enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
@@ -4051,8 +4023,7 @@ static bool __has_usable_swap(void)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
- struct swap_info_struct *si, *next;
- int nid = folio_nid(folio);
+ struct swap_info_struct *si;
if (!(gfp & __GFP_IO))
return;
@@ -4071,8 +4042,7 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
- avail_lists[nid]) {
+ plist_for_each_entry(si, &swap_avail_head, avail_list) {
if (si->bdev) {
blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
@@ -4084,18 +4054,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
static int __init swapfile_init(void)
{
- int nid;
-
- swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
- GFP_KERNEL);
- if (!swap_avail_heads) {
- pr_emerg("Not enough memory for swap heads, swap is disabled\n");
- return -ENOMEM;
- }
-
- for_each_node(nid)
- plist_head_init(&swap_avail_heads[nid]);
-
swapfile_maximum_size = arch_max_swapfile_size();
/*