summaryrefslogtreecommitdiff
path: root/mm/swap_state.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--mm/swap_state.c488
1 files changed, 246 insertions, 242 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c354435a0923..b13e9c4baa90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap_table.h"
#include "swap.h"
/*
@@ -36,8 +37,11 @@ static const struct address_space_operations swap_aops = {
#endif
};
-struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+/* Set swap_space as read only as swap cache is handled by swap table */
+struct address_space swap_space __ro_after_init = {
+ .a_ops = &swap_aops,
+};
+
static bool enable_vma_readahead __read_mostly = true;
#define SWAP_RA_ORDER_CEILING 5
@@ -69,150 +73,237 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", K(total_swap_pages));
}
-void *get_shadow_from_swap_cache(swp_entry_t entry)
+/**
+ * swap_cache_get_folio - Looks up a folio in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * A found folio will be returned unlocked and with its refcount increased.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns the found folio on success, NULL otherwise. The caller
+ * must lock nd check if the folio still matches the swap entry before
+ * use (e.g., folio_matches_swap_entry).
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- void *shadow;
+ unsigned long swp_tb;
+ struct folio *folio;
+
+ for (;;) {
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (!swp_tb_is_folio(swp_tb))
+ return NULL;
+ folio = swp_tb_to_folio(swp_tb);
+ if (likely(folio_try_get(folio)))
+ return folio;
+ }
- shadow = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(shadow))
- return shadow;
return NULL;
}
-/*
- * add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and 'swap' instead of mapping and index.
+/**
+ * swap_cache_get_shadow - Looks up a shadow in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns either NULL or an XA_VALUE (shadow).
*/
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
- gfp_t gfp, void **shadowp)
+void *swap_cache_get_shadow(swp_entry_t entry)
{
- struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
- unsigned long i, nr = folio_nr_pages(folio);
- void *old;
+ unsigned long swp_tb;
- xas_set_update(&xas, workingset_update_node);
+ swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+ swp_cluster_offset(entry));
+ if (swp_tb_is_shadow(swp_tb))
+ return swp_tb_to_shadow(swp_tb);
+ return NULL;
+}
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+/**
+ * swap_cache_add_folio - Add a folio into the swap cache.
+ * @folio: The folio to be added.
+ * @entry: The swap entry corresponding to the folio.
+ * @shadowp: If a shadow is found, return the shadow.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * The caller also needs to update the corresponding swap_map slots with
+ * SWAP_HAS_CACHE bit to avoid race or conflict.
+ */
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+{
+ void *shadow = NULL;
+ unsigned long old_tb, new_tb;
+ struct swap_cluster_info *ci;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+ new_tb = folio_to_swp_tb(folio);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+ if (swp_tb_is_shadow(old_tb))
+ shadow = swp_tb_to_shadow(old_tb);
+ } while (++ci_off < ci_end);
- folio_ref_add(folio, nr);
+ folio_ref_add(folio, nr_pages);
folio_set_swapcache(folio);
folio->swap = entry;
+ swap_cluster_unlock(ci);
- do {
- xas_lock_irq(&xas);
- xas_create_range(&xas);
- if (xas_error(&xas))
- goto unlock;
- for (i = 0; i < nr; i++) {
- VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
- if (shadowp) {
- old = xas_load(&xas);
- if (xa_is_value(old))
- *shadowp = old;
- }
- xas_store(&xas, folio);
- xas_next(&xas);
- }
- address_space->nrpages += nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
- xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp));
-
- if (!xas_error(&xas))
- return 0;
+ node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
- folio_clear_swapcache(folio);
- folio_ref_sub(folio, nr);
- return xas_error(&xas);
+ if (shadowp)
+ *shadowp = shadow;
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache.
+/**
+ * __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
+ * @folio: The folio.
+ * @entry: The first swap entry that the folio corresponds to.
+ * @shadow: shadow value to be filled in the swap cache.
+ *
+ * Removes a folio from the swap cache and fills a shadow in place.
+ * This won't put the folio's refcount. The caller has to do that.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
*/
-void __delete_from_swap_cache(struct folio *folio,
- swp_entry_t entry, void *shadow)
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
+ swp_entry_t entry, void *shadow)
{
- struct address_space *address_space = swap_address_space(entry);
- int i;
- long nr = folio_nr_pages(folio);
- pgoff_t idx = swap_cache_index(entry);
- XA_STATE(xas, &address_space->i_pages, idx);
-
- xas_set_update(&xas, workingset_update_node);
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
-
- for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != folio, entry);
- xas_next(&xas);
- }
+ unsigned long old_tb, new_tb;
+ unsigned int ci_start, ci_off, ci_end;
+ unsigned long nr_pages = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+
+ new_tb = shadow_swp_to_tb(shadow);
+ ci_start = swp_cluster_offset(entry);
+ ci_end = ci_start + nr_pages;
+ ci_off = ci_start;
+ do {
+ /* If shadow is NULL, we set an empty shadow */
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
+ swp_tb_to_folio(old_tb) != folio);
+ } while (++ci_off < ci_end);
+
folio->swap.val = 0;
folio_clear_swapcache(folio);
- address_space->nrpages -= nr;
- __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
- __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+ node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+ lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
}
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache and locked.
- * It will never put the folio into the free list,
- * the caller has a reference on the folio.
+/**
+ * swap_cache_del_folio - Removes a folio from the swap cache.
+ * @folio: The folio.
+ *
+ * Same as __swap_cache_del_folio, but handles lock and refcount. The
+ * caller must ensure the folio is either clean or has a swap count
+ * equal to zero, or it may cause data loss.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
*/
-void delete_from_swap_cache(struct folio *folio)
+void swap_cache_del_folio(struct folio *folio)
{
+ struct swap_cluster_info *ci;
swp_entry_t entry = folio->swap;
- struct address_space *address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(folio, entry, NULL);
- xa_unlock_irq(&address_space->i_pages);
+ ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+ __swap_cache_del_folio(ci, folio, entry, NULL);
+ swap_cluster_unlock(ci);
put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
- unsigned long end)
+/**
+ * __swap_cache_replace_folio - Replace a folio in the swap cache.
+ * @ci: The locked swap cluster.
+ * @old: The old folio to be replaced.
+ * @new: The new folio.
+ *
+ * Replace an existing folio in the swap cache with a new folio. The
+ * caller is responsible for setting up the new folio's flag and swap
+ * entries. Replacement will take the new folio's swap entry value as
+ * the starting offset to override all slots covered by the new folio.
+ *
+ * Context: Caller must ensure both folios are locked, and lock the
+ * cluster that holds the old folio to be replaced.
+ */
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+ struct folio *old, struct folio *new)
{
- unsigned long curr = begin;
- void *old;
-
- for (;;) {
- swp_entry_t entry = swp_entry(type, curr);
- unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
- struct address_space *address_space = swap_address_space(entry);
- XA_STATE(xas, &address_space->i_pages, index);
-
- xas_set_update(&xas, workingset_update_node);
-
- xa_lock_irq(&address_space->i_pages);
- xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
- if (!xa_is_value(old))
- continue;
- xas_store(&xas, NULL);
- }
- xa_unlock_irq(&address_space->i_pages);
+ swp_entry_t entry = new->swap;
+ unsigned long nr_pages = folio_nr_pages(new);
+ unsigned int ci_off = swp_cluster_offset(entry);
+ unsigned int ci_end = ci_off + nr_pages;
+ unsigned long old_tb, new_tb;
+
+ VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
+ VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
+ VM_WARN_ON_ONCE(!entry.val);
+
+ /* Swap cache still stores N entries instead of a high-order entry */
+ new_tb = folio_to_swp_tb(new);
+ do {
+ old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+ WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+ } while (++ci_off < ci_end);
- /* search the next swapcache until we meet end */
- curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
- if (curr > end)
- break;
+ /*
+ * If the old folio is partially replaced (e.g., splitting a large
+ * folio, the old folio is shrunk, and new split sub folios replace
+ * the shrunk part), ensure the new folio doesn't overlap it.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+ folio_order(old) != folio_order(new)) {
+ ci_off = swp_cluster_offset(old->swap);
+ ci_end = ci_off + folio_nr_pages(old);
+ do {
+ WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
+ } while (++ci_off < ci_end);
}
}
+/**
+ * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
+ * @entry: The starting index entry.
+ * @nr_ents: How many slots need to be cleared.
+ *
+ * Context: Caller must ensure the range is valid, all in one single cluster,
+ * not occupied by any folio, and lock the cluster.
+ */
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
+{
+ struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+ unsigned int ci_off = swp_cluster_offset(entry), ci_end;
+ unsigned long old;
+
+ ci_end = ci_off + nr_ents;
+ do {
+ old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
+ WARN_ON_ONCE(swp_tb_is_folio(old));
+ } while (++ci_off < ci_end);
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -272,100 +363,50 @@ static inline bool swap_use_vma_readahead(void)
return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}
-/*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+/**
+ * swap_update_readahead - Update the readahead statistics of VMA or globally.
+ * @folio: the swap cache folio that just got hit.
+ * @vma: the VMA that should be updated, could be NULL for global update.
+ * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
*/
-struct folio *swap_cache_get_folio(swp_entry_t entry,
- struct vm_area_struct *vma, unsigned long addr)
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct folio *folio;
-
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
- if (!IS_ERR(folio)) {
- bool vma_ra = swap_use_vma_readahead();
- bool readahead;
+ bool readahead, vma_ra = swap_use_vma_readahead();
- /*
- * At the moment, we don't support PG_readahead for anon THP
- * so let's bail out rather than confusing the readahead stat.
- */
- if (unlikely(folio_test_large(folio)))
- return folio;
-
- readahead = folio_test_clear_readahead(folio);
- if (vma && vma_ra) {
- unsigned long ra_val;
- int win, hits;
-
- ra_val = GET_SWAP_RA_VAL(vma);
- win = SWAP_RA_WIN(ra_val);
- hits = SWAP_RA_HITS(ra_val);
- if (readahead)
- hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
- atomic_long_set(&vma->swap_readahead_info,
- SWAP_RA_VAL(addr, win, hits));
- }
-
- if (readahead) {
- count_vm_event(SWAP_RA_HIT);
- if (!vma || !vma_ra)
- atomic_inc(&swapin_readahead_hits);
- }
- } else {
- folio = NULL;
+ /*
+ * At the moment, we don't support PG_readahead for anon THP
+ * so let's bail out rather than confusing the readahead stat.
+ */
+ if (unlikely(folio_test_large(folio)))
+ return;
+
+ readahead = folio_test_clear_readahead(folio);
+ if (vma && vma_ra) {
+ unsigned long ra_val;
+ int win, hits;
+
+ ra_val = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_val);
+ hits = SWAP_RA_HITS(ra_val);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
}
- return folio;
-}
-
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
- pgoff_t index)
-{
- swp_entry_t swp;
- struct swap_info_struct *si;
- struct folio *folio = filemap_get_entry(mapping, index);
-
- if (!folio)
- return ERR_PTR(-ENOENT);
- if (!xa_is_value(folio))
- return folio;
- if (!shmem_mapping(mapping))
- return ERR_PTR(-ENOENT);
-
- swp = radix_to_swp_entry(folio);
- /* There might be swapin error entries in shmem mapping. */
- if (non_swap_entry(swp))
- return ERR_PTR(-ENOENT);
- /* Prevent swapoff from happening to us */
- si = get_swap_device(swp);
- if (!si)
- return ERR_PTR(-ENOENT);
- index = swap_cache_index(swp);
- folio = filemap_get_folio(swap_address_space(swp), index);
- put_swap_device(si);
- return folio;
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma || !vma_ra)
+ atomic_inc(&swapin_readahead_hits);
+ }
}
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct folio *folio;
struct folio *new_folio = NULL;
struct folio *result = NULL;
@@ -374,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*new_page_allocated = false;
for (;;) {
int err;
+
/*
- * First check the swap cache. Since this is normally
- * called after swap_cache_get_folio() failed, re-calling
- * that would confuse statistics.
+ * Check the swap cache first, if a cached folio is found,
+ * return it unlocked. The caller will lock and check it.
*/
- folio = filemap_get_folio(swap_address_space(entry),
- swap_cache_index(entry));
- if (!IS_ERR(folio))
+ folio = swap_cache_get_folio(entry);
+ if (folio)
goto got_folio;
/*
@@ -423,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
goto put_and_return;
/*
- * We might race against __delete_from_swap_cache(), and
+ * We might race against __swap_cache_del_folio(), and
* stumble across a swap_map entry whose SWAP_HAS_CACHE
* has not yet been cleared. Or race against another
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
@@ -441,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
- goto fail_unlock;
-
+ swap_cache_add_folio(new_folio, entry, &shadow);
memcg1_swapin(entry, 1);
if (shadow)
@@ -590,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
- struct swap_info_struct *si = swp_swap_info(entry);
+ struct swap_info_struct *si = __swap_entry_to_info(entry);
struct blk_plug plug;
struct swap_iocb *splug = NULL;
bool page_allocated;
@@ -636,41 +673,6 @@ skip:
return folio;
}
-int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-{
- struct address_space *spaces, *space;
- unsigned int i, nr;
-
- nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
- if (!spaces)
- return -ENOMEM;
- for (i = 0; i < nr; i++) {
- space = spaces + i;
- xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
- atomic_set(&space->i_mmap_writable, 0);
- space->a_ops = &swap_aops;
- /* swap cache doesn't use writeback related tags */
- mapping_set_no_writeback_tags(space);
- }
- nr_swapper_spaces[type] = nr;
- swapper_spaces[type] = spaces;
-
- return 0;
-}
-
-void exit_swap_address_space(unsigned int type)
-{
- int i;
- struct address_space *spaces = swapper_spaces[type];
-
- for (i = 0; i < nr_swapper_spaces[type]; i++)
- VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
- kvfree(spaces);
- nr_swapper_spaces[type] = 0;
- swapper_spaces[type] = NULL;
-}
-
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
unsigned long *end)
{
@@ -843,7 +845,7 @@ static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
-static int __init swap_init_sysfs(void)
+static int __init swap_init(void)
{
int err;
struct kobject *swap_kobj;
@@ -858,11 +860,13 @@ static int __init swap_init_sysfs(void)
pr_err("failed to register swap group\n");
goto delete_obj;
}
+ /* Swap cache writeback is LRU based, no tags for it */
+ mapping_set_no_writeback_tags(&swap_space);
return 0;
delete_obj:
kobject_put(swap_kobj);
return err;
}
-subsys_initcall(swap_init_sysfs);
+subsys_initcall(swap_init);
#endif