Diffstat (limited to 'mm/swap_state.c')
 mm/swap_state.c | 488 ++++++++++++++++++++++++++----------------------------
 1 file changed, 246 insertions(+), 242 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c354435a0923..b13e9c4baa90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
 #include <linux/huge_mm.h>
 #include <linux/shmem_fs.h>
 #include "internal.h"
+#include "swap_table.h"
 #include "swap.h"
 
 /*
@@ -36,8 +37,11 @@ static const struct address_space_operations swap_aops = {
 #endif
 };
 
-struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+/* Set swap_space as read-only as the swap cache is handled by the swap table */
+struct address_space swap_space __ro_after_init = {
+	.a_ops = &swap_aops,
+};
+
 static bool enable_vma_readahead __read_mostly = true;
 
 #define SWAP_RA_ORDER_CEILING	5
@@ -69,150 +73,237 @@ void show_swap_cache_info(void)
 	printk("Total swap = %lukB\n", K(total_swap_pages));
 }
 
-void *get_shadow_from_swap_cache(swp_entry_t entry)
+/**
+ * swap_cache_get_folio - Looks up a folio in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * A found folio will be returned unlocked and with its refcount increased.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns the found folio on success, NULL otherwise. The caller
+ * must lock and check if the folio still matches the swap entry before
+ * use (e.g., folio_matches_swap_entry).
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry)
 {
-	struct address_space *address_space = swap_address_space(entry);
-	pgoff_t idx = swap_cache_index(entry);
-	void *shadow;
+	unsigned long swp_tb;
+	struct folio *folio;
+
+	for (;;) {
+		swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+					swp_cluster_offset(entry));
+		if (!swp_tb_is_folio(swp_tb))
+			return NULL;
+		folio = swp_tb_to_folio(swp_tb);
+		if (likely(folio_try_get(folio)))
+			return folio;
+	}
 
-	shadow = xa_load(&address_space->i_pages, idx);
-	if (xa_is_value(shadow))
-		return shadow;
 	return NULL;
 }
 
-/*
- * add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and 'swap' instead of mapping and index.
+/**
+ * swap_cache_get_shadow - Looks up a shadow in the swap cache.
+ * @entry: swap entry used for the lookup.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * Return: Returns either NULL or an XA_VALUE (shadow).
  */
-int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
-			gfp_t gfp, void **shadowp)
+void *swap_cache_get_shadow(swp_entry_t entry)
 {
-	struct address_space *address_space = swap_address_space(entry);
-	pgoff_t idx = swap_cache_index(entry);
-	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
-	unsigned long i, nr = folio_nr_pages(folio);
-	void *old;
+	unsigned long swp_tb;
 
-	xas_set_update(&xas, workingset_update_node);
+	swp_tb = swap_table_get(__swap_entry_to_cluster(entry),
+				swp_cluster_offset(entry));
+	if (swp_tb_is_shadow(swp_tb))
+		return swp_tb_to_shadow(swp_tb);
+	return NULL;
+}
 
-	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
-	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+/**
+ * swap_cache_add_folio - Add a folio into the swap cache.
+ * @folio: The folio to be added.
+ * @entry: The swap entry corresponding to the folio.
+ * @shadowp: If a shadow is found, return the shadow.
+ *
+ * Context: Caller must ensure @entry is valid and protect the swap device
+ * with reference count or locks.
+ * The caller also needs to update the corresponding swap_map slots with
+ * SWAP_HAS_CACHE bit to avoid race or conflict.
+ */
+void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp)
+{
+	void *shadow = NULL;
+	unsigned long old_tb, new_tb;
+	struct swap_cluster_info *ci;
+	unsigned int ci_start, ci_off, ci_end;
+	unsigned long nr_pages = folio_nr_pages(folio);
+
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
+
+	new_tb = folio_to_swp_tb(folio);
+	ci_start = swp_cluster_offset(entry);
+	ci_end = ci_start + nr_pages;
+	ci_off = ci_start;
+	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+	do {
+		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+		WARN_ON_ONCE(swp_tb_is_folio(old_tb));
+		if (swp_tb_is_shadow(old_tb))
+			shadow = swp_tb_to_shadow(old_tb);
+	} while (++ci_off < ci_end);
 
-	folio_ref_add(folio, nr);
+	folio_ref_add(folio, nr_pages);
 	folio_set_swapcache(folio);
 	folio->swap = entry;
+	swap_cluster_unlock(ci);
 
-	do {
-		xas_lock_irq(&xas);
-		xas_create_range(&xas);
-		if (xas_error(&xas))
-			goto unlock;
-		for (i = 0; i < nr; i++) {
-			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
-			if (shadowp) {
-				old = xas_load(&xas);
-				if (xa_is_value(old))
-					*shadowp = old;
-			}
-			xas_store(&xas, folio);
-			xas_next(&xas);
-		}
-		address_space->nrpages += nr;
-		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
-		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
-		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp));
-
-	if (!xas_error(&xas))
-		return 0;
+	node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
+	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages);
 
-	folio_clear_swapcache(folio);
-	folio_ref_sub(folio, nr);
-	return xas_error(&xas);
+	if (shadowp)
+		*shadowp = shadow;
 }
 
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache.
+/**
+ * __swap_cache_del_folio - Removes a folio from the swap cache.
+ * @ci: The locked swap cluster.
+ * @folio: The folio.
+ * @entry: The first swap entry that the folio corresponds to.
+ * @shadow: shadow value to be filled in the swap cache.
+ *
+ * Removes a folio from the swap cache and fills a shadow in place.
+ * This won't put the folio's refcount. The caller has to do that.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache
+ * using the index of @entry, and lock the cluster that holds the entries.
  */
-void __delete_from_swap_cache(struct folio *folio,
-			swp_entry_t entry, void *shadow)
+void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio,
+			    swp_entry_t entry, void *shadow)
 {
-	struct address_space *address_space = swap_address_space(entry);
-	int i;
-	long nr = folio_nr_pages(folio);
-	pgoff_t idx = swap_cache_index(entry);
-	XA_STATE(xas, &address_space->i_pages, idx);
-
-	xas_set_update(&xas, workingset_update_node);
-
-	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
-	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
-
-	for (i = 0; i < nr; i++) {
-		void *entry = xas_store(&xas, shadow);
-		VM_BUG_ON_PAGE(entry != folio, entry);
-		xas_next(&xas);
-	}
+	unsigned long old_tb, new_tb;
+	unsigned int ci_start, ci_off, ci_end;
+	unsigned long nr_pages = folio_nr_pages(folio);
+
+	VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio);
+	VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio);
+
+	new_tb = shadow_swp_to_tb(shadow);
+	ci_start = swp_cluster_offset(entry);
+	ci_end = ci_start + nr_pages;
+	ci_off = ci_start;
+	do {
+		/* If shadow is NULL, we set an empty shadow */
+		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) ||
+			     swp_tb_to_folio(old_tb) != folio);
+	} while (++ci_off < ci_end);
+
 	folio->swap.val = 0;
 	folio_clear_swapcache(folio);
-	address_space->nrpages -= nr;
-	__node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
-	__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
+	node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages);
+	lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages);
 }
 
-/*
- * This must be called only on folios that have
- * been verified to be in the swap cache and locked.
- * It will never put the folio into the free list,
- * the caller has a reference on the folio.
+/**
+ * swap_cache_del_folio - Removes a folio from the swap cache.
+ * @folio: The folio.
+ *
+ * Same as __swap_cache_del_folio, but handles lock and refcount. The
+ * caller must ensure the folio is either clean or has a swap count
+ * equal to zero, or it may cause data loss.
+ *
+ * Context: Caller must ensure the folio is locked and in the swap cache.
  */
-void delete_from_swap_cache(struct folio *folio)
+void swap_cache_del_folio(struct folio *folio)
 {
+	struct swap_cluster_info *ci;
 	swp_entry_t entry = folio->swap;
-	struct address_space *address_space = swap_address_space(entry);
 
-	xa_lock_irq(&address_space->i_pages);
-	__delete_from_swap_cache(folio, entry, NULL);
-	xa_unlock_irq(&address_space->i_pages);
+	ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry));
+	__swap_cache_del_folio(ci, folio, entry, NULL);
+	swap_cluster_unlock(ci);
 
 	put_swap_folio(folio, entry);
 	folio_ref_sub(folio, folio_nr_pages(folio));
 }
 
-void clear_shadow_from_swap_cache(int type, unsigned long begin,
-				  unsigned long end)
+/**
+ * __swap_cache_replace_folio - Replace a folio in the swap cache.
+ * @ci: The locked swap cluster.
+ * @old: The old folio to be replaced.
+ * @new: The new folio.
+ *
+ * Replace an existing folio in the swap cache with a new folio. The
+ * caller is responsible for setting up the new folio's flag and swap
+ * entries. Replacement will take the new folio's swap entry value as
+ * the starting offset to override all slots covered by the new folio.
+ *
+ * Context: Caller must ensure both folios are locked, and lock the
+ * cluster that holds the old folio to be replaced.
+ */
+void __swap_cache_replace_folio(struct swap_cluster_info *ci,
+				struct folio *old, struct folio *new)
 {
-	unsigned long curr = begin;
-	void *old;
-
-	for (;;) {
-		swp_entry_t entry = swp_entry(type, curr);
-		unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
-		struct address_space *address_space = swap_address_space(entry);
-		XA_STATE(xas, &address_space->i_pages, index);
-
-		xas_set_update(&xas, workingset_update_node);
-
-		xa_lock_irq(&address_space->i_pages);
-		xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
-			if (!xa_is_value(old))
-				continue;
-			xas_store(&xas, NULL);
-		}
-		xa_unlock_irq(&address_space->i_pages);
+	swp_entry_t entry = new->swap;
+	unsigned long nr_pages = folio_nr_pages(new);
+	unsigned int ci_off = swp_cluster_offset(entry);
+	unsigned int ci_end = ci_off + nr_pages;
+	unsigned long old_tb, new_tb;
+
+	VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new));
+	VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new));
+	VM_WARN_ON_ONCE(!entry.val);
+
+	/* Swap cache still stores N entries instead of a high-order entry */
+	new_tb = folio_to_swp_tb(new);
+	do {
+		old_tb = __swap_table_xchg(ci, ci_off, new_tb);
+		WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old);
+	} while (++ci_off < ci_end);
 
-		/* search the next swapcache until we meet end */
-		curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
-		if (curr > end)
-			break;
+	/*
+	 * If the old folio is partially replaced (e.g., splitting a large
+	 * folio, the old folio is shrunk, and new split sub folios replace
+	 * the shrunk part), ensure the new folio doesn't overlap it.
+	 */
+	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
+	    folio_order(old) != folio_order(new)) {
+		ci_off = swp_cluster_offset(old->swap);
+		ci_end = ci_off + folio_nr_pages(old);
+		while (ci_off++ < ci_end)
+			WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old);
 	}
 }
 
+/**
+ * __swap_cache_clear_shadow - Clears a set of shadows in the swap cache.
+ * @entry: The starting index entry.
+ * @nr_ents: How many slots need to be cleared.
+ *
+ * Context: Caller must ensure the range is valid, all in one single cluster,
+ * not occupied by any folio, and lock the cluster.
+ */
+void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents)
+{
+	struct swap_cluster_info *ci = __swap_entry_to_cluster(entry);
+	unsigned int ci_off = swp_cluster_offset(entry), ci_end;
+	unsigned long old;
+
+	ci_end = ci_off + nr_ents;
+	do {
+		old = __swap_table_xchg(ci, ci_off, null_to_swp_tb());
+		WARN_ON_ONCE(swp_tb_is_folio(old));
+	} while (++ci_off < ci_end);
+}
+
 /*
  * If we are the only user, then try to free up the swap cache.
  *
@@ -272,100 +363,50 @@ static inline bool swap_use_vma_readahead(void)
 	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
 }
 
-/*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+/**
+ * swap_update_readahead - Update the readahead statistics of VMA or globally.
+ * @folio: the swap cache folio that just got hit.
+ * @vma: the VMA that should be updated, could be NULL for global update.
+ * @addr: the addr that triggered the swapin, ignored if @vma is NULL.
  */
-struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr)
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+			   unsigned long addr)
 {
-	struct folio *folio;
-
-	folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
-	if (!IS_ERR(folio)) {
-		bool vma_ra = swap_use_vma_readahead();
-		bool readahead;
+	bool readahead, vma_ra = swap_use_vma_readahead();
 
-		/*
-		 * At the moment, we don't support PG_readahead for anon THP
-		 * so let's bail out rather than confusing the readahead stat.
-		 */
-		if (unlikely(folio_test_large(folio)))
-			return folio;
-
-		readahead = folio_test_clear_readahead(folio);
-		if (vma && vma_ra) {
-			unsigned long ra_val;
-			int win, hits;
-
-			ra_val = GET_SWAP_RA_VAL(vma);
-			win = SWAP_RA_WIN(ra_val);
-			hits = SWAP_RA_HITS(ra_val);
-			if (readahead)
-				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
-			atomic_long_set(&vma->swap_readahead_info,
-					SWAP_RA_VAL(addr, win, hits));
-		}
-
-		if (readahead) {
-			count_vm_event(SWAP_RA_HIT);
-			if (!vma || !vma_ra)
-				atomic_inc(&swapin_readahead_hits);
-		}
-	} else {
-		folio = NULL;
+	/*
+	 * At the moment, we don't support PG_readahead for anon THP
+	 * so let's bail out rather than confusing the readahead stat.
+	 */
+	if (unlikely(folio_test_large(folio)))
+		return;
+
+	readahead = folio_test_clear_readahead(folio);
+	if (vma && vma_ra) {
+		unsigned long ra_val;
+		int win, hits;
+
+		ra_val = GET_SWAP_RA_VAL(vma);
+		win = SWAP_RA_WIN(ra_val);
+		hits = SWAP_RA_HITS(ra_val);
+		if (readahead)
+			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+		atomic_long_set(&vma->swap_readahead_info,
+				SWAP_RA_VAL(addr, win, hits));
 	}
 
-	return folio;
-}
-
-/**
- * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * This differs from filemap_get_folio() in that it will also look for the
- * folio in the swap cache.
- *
- * Return: The found folio or %NULL.
- */
-struct folio *filemap_get_incore_folio(struct address_space *mapping,
-		pgoff_t index)
-{
-	swp_entry_t swp;
-	struct swap_info_struct *si;
-	struct folio *folio = filemap_get_entry(mapping, index);
-
-	if (!folio)
-		return ERR_PTR(-ENOENT);
-	if (!xa_is_value(folio))
-		return folio;
-	if (!shmem_mapping(mapping))
-		return ERR_PTR(-ENOENT);
-
-	swp = radix_to_swp_entry(folio);
-	/* There might be swapin error entries in shmem mapping. */
-	if (non_swap_entry(swp))
-		return ERR_PTR(-ENOENT);
-	/* Prevent swapoff from happening to us */
-	si = get_swap_device(swp);
-	if (!si)
-		return ERR_PTR(-ENOENT);
-	index = swap_cache_index(swp);
-	folio = filemap_get_folio(swap_address_space(swp), index);
-	put_swap_device(si);
-	return folio;
+	if (readahead) {
+		count_vm_event(SWAP_RA_HIT);
+		if (!vma || !vma_ra)
+			atomic_inc(&swapin_readahead_hits);
+	}
 }
 
 struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
 		bool skip_if_exists)
 {
-	struct swap_info_struct *si = swp_swap_info(entry);
+	struct swap_info_struct *si = __swap_entry_to_info(entry);
 	struct folio *folio;
 	struct folio *new_folio = NULL;
 	struct folio *result = NULL;
@@ -374,14 +415,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	*new_page_allocated = false;
 	for (;;) {
 		int err;
+
 		/*
-		 * First check the swap cache. Since this is normally
-		 * called after swap_cache_get_folio() failed, re-calling
-		 * that would confuse statistics.
+		 * Check the swap cache first, if a cached folio is found,
+		 * return it unlocked. The caller will lock and check it.
 		 */
-		folio = filemap_get_folio(swap_address_space(entry),
-						swap_cache_index(entry));
-		if (!IS_ERR(folio))
+		folio = swap_cache_get_folio(entry);
+		if (folio)
 			goto got_folio;
 
 		/*
@@ -423,7 +463,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			goto put_and_return;
 
 		/*
-		 * We might race against __delete_from_swap_cache(), and
+		 * We might race against __swap_cache_del_folio(), and
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
 		 * has not yet been cleared. Or race against another
 		 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
@@ -441,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
 			goto fail_unlock;
 
-		/* May fail (-ENOMEM) if XArray node allocation failed. */
-		if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
-			goto fail_unlock;
-
+		swap_cache_add_folio(new_folio, entry, &shadow);
 		memcg1_swapin(entry, 1);
 
 		if (shadow)
@@ -590,7 +627,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long offset = entry_offset;
 	unsigned long start_offset, end_offset;
 	unsigned long mask;
-	struct swap_info_struct *si = swp_swap_info(entry);
+	struct swap_info_struct *si = __swap_entry_to_info(entry);
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
@@ -636,41 +673,6 @@ skip:
 	return folio;
 }
 
-int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-{
-	struct address_space *spaces, *space;
-	unsigned int i, nr;
-
-	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
-	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
-	if (!spaces)
-		return -ENOMEM;
-	for (i = 0; i < nr; i++) {
-		space = spaces + i;
-		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
-		atomic_set(&space->i_mmap_writable, 0);
-		space->a_ops = &swap_aops;
-		/* swap cache doesn't use writeback related tags */
-		mapping_set_no_writeback_tags(space);
-	}
-	nr_swapper_spaces[type] = nr;
-	swapper_spaces[type] = spaces;
-
-	return 0;
-}
-
-void exit_swap_address_space(unsigned int type)
-{
-	int i;
-	struct address_space *spaces = swapper_spaces[type];
-
-	for (i = 0; i < nr_swapper_spaces[type]; i++)
-		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
-	kvfree(spaces);
-	nr_swapper_spaces[type] = 0;
-	swapper_spaces[type] = NULL;
-}
-
 static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
 			   unsigned long *end)
 {
@@ -843,7 +845,7 @@ static const struct attribute_group swap_attr_group = {
 	.attrs = swap_attrs,
 };
 
-static int __init swap_init_sysfs(void)
+static int __init swap_init(void)
 {
 	int err;
 	struct kobject *swap_kobj;
@@ -858,11 +860,13 @@ static int __init swap_init_sysfs(void)
 		pr_err("failed to register swap group\n");
 		goto delete_obj;
 	}
+	/* Swap cache writeback is LRU based, no tags for it */
+	mapping_set_no_writeback_tags(&swap_space);
 	return 0;
 
 delete_obj:
 	kobject_put(swap_kobj);
 	return err;
 }
-subsys_initcall(swap_init_sysfs);
+subsys_initcall(swap_init);
 #endif
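
For readers following the API change above, here is a minimal, hypothetical caller sketch (not part of the patch) showing how the new swap_cache_get_folio() contract is meant to be used. It assumes the swap device is already pinned (for example via get_swap_device()) and relies on folio_matches_swap_entry(), the re-check helper referenced in the kernel-doc; the function name example_swapin_lookup is invented for illustration only.

/* Hypothetical example, not from the patch: look up and pin a swap cache folio. */
static struct folio *example_swapin_lookup(swp_entry_t entry)
{
	struct folio *folio;

	/* Returned unlocked with an elevated refcount, or NULL on a cache miss. */
	folio = swap_cache_get_folio(entry);
	if (!folio)
		return NULL;

	folio_lock(folio);
	/* Re-check under the folio lock; the folio may have been freed or reused. */
	if (!folio_matches_swap_entry(folio, entry)) {
		folio_unlock(folio);
		folio_put(folio);
		return NULL;
	}

	return folio;	/* Locked; the caller unlocks and puts it when done. */
}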