summaryrefslogtreecommitdiff
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c566
1 files changed, 363 insertions, 203 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b4f3cc712580..10760240a3a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -46,6 +46,7 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap_table.h"
#include "internal.h"
#include "swap.h"
@@ -58,9 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si,
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset);
-static inline void unlock_cluster(struct swap_cluster_info *ci);
+static void move_cluster(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, struct list_head *list,
+ enum swap_cluster_flags new_flags);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -105,7 +106,9 @@ static PLIST_HEAD(swap_active_head);
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
+
+static struct kmem_cache *swap_table_cachep;
static DEFINE_MUTEX(swapon_mutex);
@@ -127,14 +130,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
.lock = INIT_LOCAL_LOCK(),
};
-static struct swap_info_struct *swap_type_to_swap_info(int type)
+/* May return NULL on invalid type, caller must check for NULL return */
+static struct swap_info_struct *swap_type_to_info(int type)
{
if (type >= MAX_SWAPFILES)
return NULL;
-
return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}
+/* May return NULL on invalid entry, caller must check for NULL return */
+static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry)
+{
+ return swap_type_to_info(swp_type(entry));
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
@@ -212,16 +221,15 @@ static bool swap_is_last_map(struct swap_info_struct *si,
static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
- swp_entry_t entry = swp_entry(si->type, offset);
- struct address_space *address_space = swap_address_space(entry);
+ const swp_entry_t entry = swp_entry(si->type, offset);
struct swap_cluster_info *ci;
struct folio *folio;
int ret, nr_pages;
bool need_reclaim;
again:
- folio = filemap_get_folio(address_space, swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
return 0;
nr_pages = folio_nr_pages(folio);
@@ -241,13 +249,12 @@ again:
* Offset could point to the middle of a large folio, or folio
* may no longer point to the expected offset before it's locked.
*/
- entry = folio->swap;
- if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+ if (!folio_matches_swap_entry(folio, entry)) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
- offset = swp_offset(entry);
+ offset = swp_offset(folio->swap);
need_reclaim = ((flags & TTRS_ANYWAY) ||
((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
@@ -260,13 +267,13 @@ again:
* swap_map is HAS_CACHE only, which means the slots have no page table
* reference or pending writeback, and can't be allocated to others.
*/
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
need_reclaim = swap_only_has_cache(si, offset, nr_pages);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (!need_reclaim)
goto out_unlock;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
ret = nr_pages;
out_unlock:
@@ -347,7 +354,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
sector_t swap_folio_sector(struct folio *folio)
{
- struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ struct swap_info_struct *sis = __swap_entry_to_info(folio->swap);
struct swap_extent *se;
sector_t sector;
pgoff_t offset;
@@ -387,19 +394,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
-#ifdef CONFIG_THP_SWAP
-#define SWAPFILE_CLUSTER HPAGE_PMD_NR
-
-#define swap_entry_order(order) (order)
-#else
-#define SWAPFILE_CLUSTER 256
-
-/*
- * Define swap_entry_order() as constant to let compiler to optimize
- * out some code if !CONFIG_THP_SWAP
- */
-#define swap_entry_order(order) 0
-#endif
#define LATENCY_LIMIT 256
static inline bool cluster_is_empty(struct swap_cluster_info *info)
@@ -412,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
return info->flags == CLUSTER_FLAG_DISCARD;
}
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+ return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
return false;
+ if (!cluster_table_is_alloced(ci))
+ return false;
if (!order)
return true;
return cluster_is_empty(ci) || order == ci->order;
@@ -427,32 +428,126 @@ static inline unsigned int cluster_index(struct swap_info_struct *si,
return ci - si->cluster_info;
}
-static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
- unsigned long offset)
-{
- return &si->cluster_info[offset / SWAPFILE_CLUSTER];
-}
-
static inline unsigned int cluster_offset(struct swap_info_struct *si,
struct swap_cluster_info *ci)
{
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
- unsigned long offset)
+static struct swap_table *swap_table_alloc(gfp_t gfp)
{
- struct swap_cluster_info *ci;
+ struct folio *folio;
- ci = offset_to_cluster(si, offset);
- spin_lock(&ci->lock);
+ if (!SWP_TABLE_USE_PAGE)
+ return kmem_cache_zalloc(swap_table_cachep, gfp);
- return ci;
+ folio = folio_alloc(gfp | __GFP_ZERO, 0);
+ if (folio)
+ return folio_address(folio);
+ return NULL;
+}
+
+static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
+{
+ struct folio *folio;
+
+ folio = page_folio(container_of(head, struct page, rcu_head));
+ folio_put(folio);
+}
+
+static void swap_table_free(struct swap_table *table)
+{
+ if (!SWP_TABLE_USE_PAGE) {
+ kmem_cache_free(swap_table_cachep, table);
+ return;
+ }
+
+ call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
+ swap_table_free_folio_rcu_cb);
+}
+
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
+{
+ unsigned int ci_off;
+ struct swap_table *table;
+
+ /* Only empty cluster's table is allow to be freed */
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+ VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+ table = (void *)rcu_dereference_protected(ci->table, true);
+ rcu_assign_pointer(ci->table, NULL);
+
+ swap_table_free(table);
}
-static inline void unlock_cluster(struct swap_cluster_info *ci)
+/*
+ * Allocate swap table for one cluster. Attempt an atomic allocation first,
+ * then fallback to sleeping allocation.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
{
+ struct swap_table *table;
+
+ /*
+ * Only cluster isolation from the allocator does table allocation.
+ * Swap allocator uses percpu clusters and holds the local lock.
+ */
+ lockdep_assert_held(&ci->lock);
+ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+ /* The cluster must be free and was just isolated from the free list. */
+ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci));
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (table) {
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+ }
+
+ /*
+ * Try a sleep allocation. Each isolated free cluster may cause
+ * a sleep allocation, but there is a limited number of them, so
+ * the potential recursive allocation is limited.
+ */
spin_unlock(&ci->lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
+ local_unlock(&percpu_swap_cluster.lock);
+
+ table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL);
+
+ /*
+ * Back to atomic context. We might have migrated to a new CPU with a
+ * usable percpu cluster. But just keep using the isolated cluster to
+ * make things easier. Migration indicates a slight change of workload
+ * so using a new free cluster might not be a bad idea, and the worst
+ * could happen with ignoring the percpu cluster is fragmentation,
+ * which is acceptable since this fallback and race is rare.
+ */
+ local_lock(&percpu_swap_cluster.lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_lock(&si->global_cluster_lock);
+ spin_lock(&ci->lock);
+
+ /* Nothing except this helper should touch a dangling empty cluster. */
+ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) {
+ if (table)
+ swap_table_free(table);
+ return ci;
+ }
+
+ if (!table) {
+ move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
+ spin_unlock(&ci->lock);
+ return NULL;
+ }
+
+ rcu_assign_pointer(ci->table, table);
+ return ci;
}
static void move_cluster(struct swap_info_struct *si,
@@ -470,11 +565,6 @@ static void move_cluster(struct swap_info_struct *si,
else
list_move_tail(&ci->list, list);
spin_unlock(&si->lock);
-
- if (ci->flags == CLUSTER_FLAG_FRAG)
- atomic_long_dec(&si->frag_cluster_nr[ci->order]);
- else if (new_flags == CLUSTER_FLAG_FRAG)
- atomic_long_inc(&si->frag_cluster_nr[ci->order]);
ci->flags = new_flags;
}
@@ -489,7 +579,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&ci->lock);
+ swap_cluster_free_table(ci);
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
@@ -504,15 +594,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
* this returns NULL for an non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list)
+ struct swap_info_struct *si, struct list_head *list, int order)
{
- struct swap_cluster_info *ci, *ret = NULL;
+ struct swap_cluster_info *ci, *found = NULL;
spin_lock(&si->lock);
-
- if (unlikely(!(si->flags & SWP_WRITEOK)))
- goto out;
-
list_for_each_entry(ci, list, list) {
if (!spin_trylock(&ci->lock))
continue;
@@ -524,13 +610,19 @@ static struct swap_cluster_info *isolate_lock_cluster(
list_del(&ci->list);
ci->flags = CLUSTER_FLAG_NONE;
- ret = ci;
+ found = ci;
break;
}
-out:
spin_unlock(&si->lock);
- return ret;
+ if (found && !cluster_table_is_alloced(found)) {
+ /* Only an empty free cluster's swap table can be freed. */
+ VM_WARN_ON_ONCE(list != &si->free_clusters);
+ VM_WARN_ON_ONCE(!cluster_is_empty(found));
+ return swap_cluster_alloc_table(si, found);
+ }
+
+ return found;
}
/*
@@ -663,17 +755,27 @@ static void relocate_cluster(struct swap_info_struct *si,
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
*/
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_table *table;
struct swap_cluster_info *ci;
ci = cluster_info + idx;
+ if (!ci->table) {
+ table = swap_table_alloc(GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ rcu_assign_pointer(ci->table, table);
+ }
+
ci->count++;
VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags);
+
+ return 0;
}
static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -742,6 +844,26 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
+/*
+ * Currently, the swap table is not used for count tracking, just
+ * do a sanity check here to ensure nothing leaked, so the swap
+ * table should be empty upon freeing.
+ */
+static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci,
+ unsigned int start, unsigned int nr)
+{
+ unsigned int ci_off = start % SWAPFILE_CLUSTER;
+ unsigned int ci_end = ci_off + nr;
+ unsigned long swp_tb;
+
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ do {
+ swp_tb = __swap_table_get(ci, ci_off);
+ VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb));
+ } while (++ci_off < ci_end);
+ }
+}
+
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
@@ -761,6 +883,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
ci->order = order;
memset(si->swap_map + start, usage, nr_pages);
+ swap_cluster_assert_table_empty(ci, start, nr_pages);
swap_range_alloc(si, nr_pages);
ci->count += nr_pages;
@@ -815,7 +938,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
}
out:
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (si->flags & SWP_SOLIDSTATE) {
this_cpu_write(percpu_swap_cluster.offset[order], next);
this_cpu_write(percpu_swap_cluster.si[order], si);
@@ -825,6 +948,29 @@ out:
return found;
}
+static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
+ struct list_head *list,
+ unsigned int order,
+ unsigned char usage,
+ bool scan_all)
+{
+ unsigned int found = SWAP_ENTRY_INVALID;
+
+ do {
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+ unsigned long offset;
+
+ if (!ci)
+ break;
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+ if (found)
+ break;
+ } while (scan_all);
+
+ return found;
+}
+
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
long to_scan = 1;
@@ -836,7 +982,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -859,7 +1005,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (ci->flags == CLUSTER_FLAG_NONE)
relocate_cluster(si, ci);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
if (to_scan <= 0)
break;
}
@@ -898,7 +1044,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (offset == SWAP_ENTRY_INVALID)
goto new_cluster;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
/* Cluster could have been used by another order */
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
@@ -906,53 +1052,53 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
found = alloc_swap_scan_cluster(si, ci, offset,
order, usage);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
if (found)
goto done;
}
new_cluster:
- ci = isolate_lock_cluster(si, &si->free_clusters);
- if (ci) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ /*
+ * If the device need discard, prefer new cluster over nonfull
+ * to spread out the writes.
+ */
+ if (si->flags & SWP_PAGE_DISCARD) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
+ if (found)
+ goto done;
+ }
+
+ if (order < PMD_ORDER) {
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[order],
+ order, usage, true);
+ if (found)
+ goto done;
+ }
+
+ if (!(si->flags & SWP_PAGE_DISCARD)) {
+ found = alloc_swap_scan_list(si, &si->free_clusters, order, usage,
+ false);
if (found)
goto done;
}
- /* Try reclaim from full clusters if free clusters list is drained */
+ /* Try reclaim full clusters if free and nonfull lists are drained */
if (vm_swap_full())
swap_reclaim_full_clusters(si, false);
if (order < PMD_ORDER) {
- unsigned int frags = 0, frags_existing;
-
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- /* Clusters failed to allocate are moved to frag_clusters */
- frags++;
- }
-
- frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
- while (frags < frags_existing &&
- (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
- atomic_long_dec(&si->frag_cluster_nr[order]);
- /*
- * Rotate the frag list to iterate, they were all
- * failing high order allocation or moved here due to
- * per-CPU usage, but they could contain newly released
- * reclaimable (eg. lazy-freed swap cache) slots.
- */
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
- if (found)
- goto done;
- frags++;
- }
+ /*
+ * Scan only one fragment cluster is good enough. Order 0
+ * allocation will surely success, and large allocation
+ * failure is not critical. Scanning one cluster still
+ * keeps the list rotated and reclaimed (for HAS_CACHE).
+ */
+ found = alloc_swap_scan_list(si, &si->frag_clusters[order], order,
+ usage, false);
+ if (found)
+ goto done;
}
/*
@@ -971,24 +1117,20 @@ new_cluster:
* Clusters here have at least one usable slots and can't fail order 0
* allocation, but reclaim may drop si->lock and race with another user.
*/
- while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
- atomic_long_dec(&si->frag_cluster_nr[o]);
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->frag_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
- while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
- found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
- if (found)
- goto done;
- }
+ found = alloc_swap_scan_list(si, &si->nonfull_clusters[o],
+ 0, usage, true);
+ if (found)
+ goto done;
}
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
+
return found;
}
@@ -1145,7 +1287,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
- clear_shadow_from_swap_cache(si->type, begin, end);
+ __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries);
/*
* Make sure that try_to_unuse() observes si->inuse_pages reaching 0
@@ -1192,7 +1334,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (!si || !offset || !get_swap_device_info(si))
return false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
@@ -1200,7 +1342,7 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (found)
*entry = swp_entry(si->type, found);
} else {
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
put_swap_device(si);
@@ -1302,16 +1444,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
if (!entry.val)
return -ENOMEM;
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
- goto out_free;
+ swap_cache_add_folio(folio, entry, NULL);
return 0;
@@ -1327,7 +1460,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (data_race(!(si->flags & SWP_USED)))
@@ -1442,7 +1575,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
if (!entry.val)
goto out;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (!si)
goto bad_nofile;
if (!get_swap_device_info(si))
@@ -1468,14 +1601,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si,
unsigned long offset = swp_offset(entry);
struct swap_cluster_info *ci;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, nr))
+ ci = swap_cluster_lock(si, offset);
+ if (swap_only_has_cache(si, offset, nr)) {
swap_entries_free(si, ci, entry, nr);
- else {
+ } else {
for (int i = 0; i < nr; i++, entry.val++)
swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
}
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
}
static bool swap_entries_put_map(struct swap_info_struct *si,
@@ -1493,7 +1626,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si,
if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
goto locked_fallback;
}
@@ -1502,21 +1635,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si,
else
for (i = 0; i < nr; i++)
WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return has_cache;
fallback:
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
locked_fallback:
for (i = 0; i < nr; i++, entry.val++) {
count = swap_entry_put_locked(si, ci, entry, 1);
if (count == SWAP_HAS_CACHE)
has_cache = true;
}
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return has_cache;
-
}
/*
@@ -1566,7 +1698,7 @@ static void swap_entries_free(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
/* It should never free entries across different clusters */
- VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
+ VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1));
VM_BUG_ON(cluster_is_empty(ci));
VM_BUG_ON(ci->count < nr_pages);
@@ -1578,6 +1710,7 @@ static void swap_entries_free(struct swap_info_struct *si,
mem_cgroup_uncharge_swap(entry, nr_pages);
swap_range_free(si, offset, nr_pages);
+ swap_cluster_assert_table_empty(ci, offset, nr_pages);
if (!ci->count)
free_cluster(si, ci);
@@ -1624,7 +1757,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
int __swap_count(swp_entry_t entry)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
pgoff_t offset = swp_offset(entry);
return swap_count(si->swap_map[offset]);
@@ -1641,9 +1774,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
struct swap_cluster_info *ci;
int count;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return !!count;
}
@@ -1666,7 +1799,7 @@ int swp_swapcount(swp_entry_t entry)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
@@ -1689,7 +1822,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return count;
}
@@ -1704,7 +1837,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
int i;
bool ret = false;
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
if (nr_pages == 1) {
if (swap_count(map[roffset]))
ret = true;
@@ -1717,7 +1850,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
}
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return ret;
}
@@ -1781,7 +1914,7 @@ bool folio_free_swap(struct folio *folio)
if (folio_swapped(folio))
return false;
- delete_from_swap_cache(folio);
+ swap_cache_del_folio(folio);
folio_set_dirty(folio);
return true;
}
@@ -1855,7 +1988,7 @@ out:
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
unsigned long offset;
swp_entry_t entry = {0};
@@ -1865,7 +1998,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
+ /*
+ * Grab the local lock to be complaint
+ * with swap table allocation.
+ */
+ local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
+ local_unlock(&percpu_swap_cluster.lock);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
@@ -1936,7 +2075,7 @@ int find_first_swap(dev_t *device)
*/
sector_t swapdev_block(int type, pgoff_t offset)
{
- struct swap_info_struct *si = swap_type_to_swap_info(type);
+ struct swap_info_struct *si = swap_type_to_info(type);
struct swap_extent *se;
if (!si || !(si->flags & SWP_WRITEOK))
@@ -1992,6 +2131,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
bool hwpoisoned = false;
int ret = 1;
+ /*
+ * If the folio is removed from swap cache by others, continue to
+ * unuse other PTEs. try_to_unuse may try again if we missed this one.
+ */
+ if (!folio_matches_swap_entry(folio, entry))
+ return 0;
+
swapcache = folio;
folio = ksm_might_need_to_copy(folio, vma, addr);
if (unlikely(!folio))
@@ -2118,7 +2264,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
pte = NULL;
- folio = swap_cache_get_folio(entry, vma, addr);
+ folio = swap_cache_get_folio(entry);
if (!folio) {
struct vm_fault vmf = {
.vma = vma,
@@ -2243,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
for_each_vma(vmi, vma) {
if (vma->anon_vma && !is_vm_hugetlb_page(vma)) {
ret = unuse_vma(vma, type);
@@ -2252,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
cond_resched();
}
+unlock:
mmap_read_unlock(mm);
return ret;
}
@@ -2344,8 +2493,8 @@ retry:
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (!folio)
continue;
/*
@@ -2644,9 +2793,30 @@ static void wait_for_allocation(struct swap_info_struct *si)
BUG_ON(si->flags & SWP_WRITEOK);
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
- ci = lock_cluster(si, offset);
- unlock_cluster(ci);
+ ci = swap_cluster_lock(si, offset);
+ swap_cluster_unlock(ci);
+ }
+}
+
+static void free_cluster_info(struct swap_cluster_info *cluster_info,
+ unsigned long maxpages)
+{
+ struct swap_cluster_info *ci;
+ int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+
+ if (!cluster_info)
+ return;
+ for (i = 0; i < nr_clusters; i++) {
+ ci = cluster_info + i;
+ /* Cluster with bad marks count will have a remaining table */
+ spin_lock(&ci->lock);
+ if (rcu_dereference_protected(ci->table, true)) {
+ ci->count = 0;
+ swap_cluster_free_table(ci);
+ }
+ spin_unlock(&ci->lock);
}
+ kvfree(cluster_info);
}
/*
@@ -2681,6 +2851,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
+ unsigned int maxpages;
int err, found = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -2783,12 +2954,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_file = p->swap_file;
p->swap_file = NULL;
- p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
+ maxpages = p->max;
cluster_info = p->cluster_info;
+ p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
@@ -2799,10 +2971,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
- exit_swap_address_space(p->type);
inode = mapping->host;
@@ -2858,7 +3029,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
+ for (type = 0; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2879,7 +3050,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
type = si->type + 1;
++(*pos);
- for (; (si = swap_type_to_swap_info(type)); type++) {
+ for (; (si = swap_type_to_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
return si;
@@ -3166,21 +3337,14 @@ static int setup_swap_map(struct swap_info_struct *si,
return 0;
}
-#define SWAP_CLUSTER_INFO_COLS \
- DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
-#define SWAP_CLUSTER_SPACE_COLS \
- DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
-#define SWAP_CLUSTER_COLS \
- max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned long maxpages)
{
unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
struct swap_cluster_info *cluster_info;
- unsigned long i, j, idx;
int err = -ENOMEM;
+ unsigned long i;
cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
@@ -3206,16 +3370,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- inc_cluster_info_page(si, cluster_info, 0);
+ err = inc_cluster_info_page(si, cluster_info, 0);
+ if (err)
+ goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr >= maxpages)
continue;
- inc_cluster_info_page(si, cluster_info, page_nr);
+ err = inc_cluster_info_page(si, cluster_info, page_nr);
+ if (err)
+ goto err;
+ }
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+ err = inc_cluster_info_page(si, cluster_info, i);
+ if (err)
+ goto err;
}
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(si, cluster_info, i);
INIT_LIST_HEAD(&si->free_clusters);
INIT_LIST_HEAD(&si->full_clusters);
@@ -3224,34 +3395,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&si->nonfull_clusters[i]);
INIT_LIST_HEAD(&si->frag_clusters[i]);
- atomic_long_set(&si->frag_cluster_nr[i], 0);
}
- /*
- * Reduce false cache line sharing between cluster_info and
- * sharing same address space.
- */
- for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
- for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
- struct swap_cluster_info *ci;
- idx = i * SWAP_CLUSTER_COLS + j;
- ci = cluster_info + idx;
- if (idx >= nr_clusters)
- continue;
- if (ci->count) {
- ci->flags = CLUSTER_FLAG_NONFULL;
- list_add_tail(&ci->list, &si->nonfull_clusters[0]);
- continue;
- }
+ for (i = 0; i < nr_clusters; i++) {
+ struct swap_cluster_info *ci = &cluster_info[i];
+
+ if (ci->count) {
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ list_add_tail(&ci->list, &si->nonfull_clusters[0]);
+ } else {
ci->flags = CLUSTER_FLAG_FREE;
list_add_tail(&ci->list, &si->free_clusters);
}
}
return cluster_info;
-
err_free:
- kvfree(cluster_info);
+ free_cluster_info(cluster_info, maxpages);
err:
return ERR_PTR(err);
}
@@ -3445,13 +3605,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
- error = init_swap_address_space(si->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
error = zswap_swapon(si->type, maxpages);
if (error)
- goto free_swap_address_space;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
@@ -3486,8 +3642,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto out;
free_swap_zswap:
zswap_swapoff(si->type);
-free_swap_address_space:
- exit_swap_address_space(si->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
@@ -3502,7 +3656,8 @@ bad_swap:
spin_unlock(&swap_lock);
vfree(swap_map);
kvfree(zeromap);
- kvfree(cluster_info);
+ if (cluster_info)
+ free_cluster_info(cluster_info, maxpages);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
@@ -3553,7 +3708,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
unsigned char has_cache;
int err, i;
- si = swp_swap_info(entry);
+ si = swap_entry_to_info(entry);
if (WARN_ON_ONCE(!si)) {
pr_err("%s%08lx\n", Bad_file, entry.val);
return -EINVAL;
@@ -3562,7 +3717,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
VM_WARN_ON(usage == 1 && nr > 1);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
err = 0;
for (i = 0; i < nr; i++) {
@@ -3617,7 +3772,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
}
unlock_out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
return err;
}
@@ -3668,11 +3823,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
swap_entries_put_cache(si, entry, nr);
}
-struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
- return swap_type_to_swap_info(swp_type(entry));
-}
-
/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
@@ -3716,7 +3866,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
offset = swp_offset(entry);
- ci = lock_cluster(si, offset);
+ ci = swap_cluster_lock(si, offset);
count = swap_count(si->swap_map[offset]);
@@ -3776,7 +3926,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
out_unlock_cont:
spin_unlock(&si->cont_lock);
out:
- unlock_cluster(ci);
+ swap_cluster_unlock(ci);
put_swap_device(si);
outer:
if (page)
@@ -3950,6 +4100,16 @@ static int __init swapfile_init(void)
swapfile_maximum_size = arch_max_swapfile_size();
+ /*
+ * Once a cluster is freed, it's swap table content is read
+ * only, and all swap cache readers (swap_cache_*) verifies
+ * the content before use. So it's safe to use RCU slab here.
+ */
+ if (!SWP_TABLE_USE_PAGE)
+ swap_table_cachep = kmem_cache_create("swap_table",
+ sizeof(struct swap_table),
+ 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
#ifdef CONFIG_MIGRATION
if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
swap_migration_ad_supported = true;