diff options
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r-- | mm/swap_state.c | 308 |
1 files changed, 106 insertions, 202 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c index bfc7e8c58a6d..c354435a0923 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -20,7 +20,6 @@ #include <linux/blkdev.h> #include <linux/migrate.h> #include <linux/vmalloc.h> -#include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" @@ -28,10 +27,9 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list. + * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { - .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, @@ -42,6 +40,8 @@ struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; +#define SWAP_RA_ORDER_CEILING 5 + #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK @@ -72,24 +72,24 @@ void show_swap_cache_info(void) void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); - struct page *page; + pgoff_t idx = swap_cache_index(entry); + void *shadow; - page = xa_load(&address_space->i_pages, idx); - if (xa_is_value(page)) - return page; + shadow = xa_load(&address_space->i_pages, idx); + if (xa_is_value(shadow)) + return shadow; return NULL; } /* * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and private instead of mapping and index. + * but sets SwapCache flag and 'swap' instead of mapping and index. */ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); unsigned long i, nr = folio_nr_pages(folio); void *old; @@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct folio *folio, struct address_space *address_space = swap_address_space(entry); int i; long nr = folio_nr_pages(folio); - pgoff_t idx = swp_offset(entry); + pgoff_t idx = swap_cache_index(entry); XA_STATE(xas, &address_space->i_pages, idx); xas_set_update(&xas, workingset_update_node); @@ -165,67 +165,6 @@ void __delete_from_swap_cache(struct folio *folio, __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } -/** - * add_to_swap - allocate swap space for a folio - * @folio: folio we want to move to swap - * - * Allocate swap space for the folio and add the folio to the - * swap cache. - * - * Context: Caller needs to hold the folio lock. - * Return: Whether the folio was added to the swap cache. - */ -bool add_to_swap(struct folio *folio) -{ - swp_entry_t entry; - int err; - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); - - entry = folio_alloc_swap(folio); - if (!entry.val) - return false; - - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache. - */ - err = add_to_swap_cache(folio, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); - if (err) - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ - goto fail; - /* - * Normally the folio will be dirtied in unmap because its - * pte should be dirty. A special case is MADV_FREE page. The - * page's pte could have dirty bit cleared but the folio's - * SwapBacked flag is still set because clearing the dirty bit - * and SwapBacked flag has no lock protected. For such folio, - * unmap will not set dirty bit for it, so folio reclaim will - * not write the folio out. This can cause data corruption when - * the folio is swapped in later. Always setting the dirty flag - * for the folio solves the problem. - */ - folio_mark_dirty(folio); - - return true; - -fail: - put_swap_folio(folio, entry); - return false; -} - /* * This must be called only on folios that have * been verified to be in the swap cache and locked. @@ -253,13 +192,14 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, for (;;) { swp_entry_t entry = swp_entry(type, curr); + unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, curr); + XA_STATE(xas, &address_space->i_pages, index); xas_set_update(&xas, workingset_update_node); xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, end) { + xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); @@ -267,9 +207,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ - curr >>= SWAP_ADDRESS_SPACE_SHIFT; - curr++; - curr <<= SWAP_ADDRESS_SPACE_SHIFT; + curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } @@ -293,15 +231,13 @@ void free_swap_cache(struct folio *folio) } /* - * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. + * Freeing a folio and also freeing any swap cache associated with + * this folio if it is the last user. */ -void free_page_and_swap_cache(struct page *page) +void free_folio_and_swap_cache(struct folio *folio) { - struct folio *folio = page_folio(page); - free_swap_cache(folio); - if (!is_huge_zero_page(page)) + if (!is_huge_zero_folio(folio)) folio_put(folio); } @@ -314,7 +250,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) struct folio_batch folios; unsigned int refs[PAGEVEC_SIZE]; - lru_add_drain(); folio_batch_init(&folios); for (int i = 0; i < nr; i++) { struct folio *folio = page_folio(encoded_page_ptr(pages[i])); @@ -350,7 +285,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, { struct folio *folio; - folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); if (!IS_ERR(folio)) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -420,7 +355,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, si = get_swap_device(swp); if (!si) return ERR_PTR(-ENOENT); - index = swp_offset(swp); + index = swap_cache_index(swp); folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); return folio; @@ -430,15 +365,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si; + struct swap_info_struct *si = swp_swap_info(entry); struct folio *folio; + struct folio *new_folio = NULL; + struct folio *result = NULL; void *shadow = NULL; *new_page_allocated = false; - si = get_swap_device(entry); - if (!si) - return NULL; - for (;;) { int err; /* @@ -447,41 +380,36 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * that would confuse statistics. */ folio = filemap_get_folio(swap_address_space(entry), - swp_offset(entry)); + swap_cache_index(entry)); if (!IS_ERR(folio)) goto got_folio; /* * Just skip read ahead for unused swap slot. - * During swap_off when swap_slot_cache is disabled, - * we have to handle the race between putting - * swap entry in swap cache and marking swap slot - * as SWAP_HAS_CACHE. That's done in later part of code or - * else swap_off will be aborted if we return NULL. */ - if (!swap_swapcount(si, entry) && swap_slot_cache_enabled) - goto fail_put_swap; + if (!swap_entry_swapped(si, entry)) + goto put_and_return; /* - * Get a new folio to read into from swap. Allocate it now, - * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will - * cause any racers to loop around until we add it to cache. + * Get a new folio to read into from swap. Allocate it now if + * new_folio not exist, before marking swap_map SWAP_HAS_CACHE, + * when -EEXIST will cause any racers to loop around until we + * add it to cache. */ - folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, - mpol, ilx, numa_node_id()); - if (!folio) - goto fail_put_swap; + if (!new_folio) { + new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id()); + if (!new_folio) + goto put_and_return; + } /* * Swap entry may have been freed since our caller observed it. */ - err = swapcache_prepare(entry); + err = swapcache_prepare(entry, 1); if (!err) break; - - folio_put(folio); - if (err != -EEXIST) - goto fail_put_swap; + else if (err != -EEXIST) + goto put_and_return; /* * Protect against a recursive call to __read_swap_cache_async() @@ -492,7 +420,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * __read_swap_cache_async() in the writeback path. */ if (skip_if_exists) - goto fail_put_swap; + goto put_and_return; /* * We might race against __delete_from_swap_cache(), and @@ -507,36 +435,36 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * The swap entry is ours to swap in. Prepare the new folio. */ + __folio_set_locked(new_folio); + __folio_set_swapbacked(new_folio); - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - - if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; - mem_cgroup_swapin_uncharge_swap(entry); + memcg1_swapin(entry, 1); if (shadow) - workingset_refault(folio, shadow); + workingset_refault(new_folio, shadow); - /* Caller will initiate read into locked folio */ - folio_add_lru(folio); + /* Caller will initiate read into locked new_folio */ + folio_add_lru(new_folio); *new_page_allocated = true; + folio = new_folio; got_folio: - put_swap_device(si); - return folio; + result = folio; + goto put_and_return; fail_unlock: - put_swap_folio(folio, entry); - folio_unlock(folio); - folio_put(folio); -fail_put_swap: - put_swap_device(si); - return NULL; + put_swap_folio(new_folio, entry); + folio_unlock(new_folio); +put_and_return: + if (!(*new_page_allocated) && new_folio) + folio_put(new_folio); + return result; } /* @@ -553,18 +481,25 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { + struct swap_info_struct *si; bool page_allocated; struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; + si = get_swap_device(entry); + if (!si) + return NULL; + mpol = get_vma_policy(vma, addr, 0, &ilx); folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); mpol_cond_put(mpol); if (page_allocated) - swap_read_folio(folio, false, plug); + swap_read_folio(folio, plug); + + put_swap_device(si); return folio; } @@ -681,7 +616,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_read_folio(folio, false, &splug); + swap_read_folio(folio, &splug); if (offset != entry_offset) { folio_set_readahead(folio); count_vm_event(SWAP_RA); @@ -696,10 +631,8 @@ skip: /* The page was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); - swap_read_folio(folio, false, NULL); - } + if (unlikely(page_allocated)) + swap_read_folio(folio, NULL); return folio; } @@ -738,62 +671,42 @@ void exit_swap_address_space(unsigned int type) swapper_spaces[type] = NULL; } -#define SWAP_RA_ORDER_CEILING 5 - -struct vma_swap_readahead { - unsigned short win; - unsigned short offset; - unsigned short nr_pte; -}; - -static void swap_ra_info(struct vm_fault *vmf, - struct vma_swap_readahead *ra_info) +static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, + unsigned long *end) { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - unsigned long faddr, pfn, fpfn, lpfn, rpfn; - unsigned long start, end; + unsigned long faddr, prev_faddr, left, right; unsigned int max_win, hits, prev_win, win; - max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), - SWAP_RA_ORDER_CEILING); - if (max_win == 1) { - ra_info->win = 1; - return; - } + max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); + if (max_win == 1) + return 1; faddr = vmf->address; - fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); - pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_faddr = SWAP_RA_ADDR(ra_val); prev_win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); - ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, - max_win, prev_win); - atomic_long_set(&vma->swap_readahead_info, - SWAP_RA_VAL(faddr, win, 0)); + win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); if (win == 1) - return; - - if (fpfn == pfn + 1) { - lpfn = fpfn; - rpfn = fpfn + win; - } else if (pfn == fpfn + 1) { - lpfn = fpfn - win + 1; - rpfn = fpfn + 1; - } else { - unsigned int left = (win - 1) / 2; - - lpfn = fpfn - left; - rpfn = fpfn + win - left; - } - start = max3(lpfn, PFN_DOWN(vma->vm_start), - PFN_DOWN(faddr & PMD_MASK)); - end = min3(rpfn, PFN_DOWN(vma->vm_end), - PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); + return 1; - ra_info->nr_pte = end - start; - ra_info->offset = fpfn - start; + if (faddr == prev_faddr + PAGE_SIZE) + left = faddr; + else if (prev_faddr == faddr + PAGE_SIZE) + left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE; + else + left = faddr - (((win - 1) / 2) << PAGE_SHIFT); + right = left + (win << PAGE_SHIFT); + if ((long)left < 0) + left = 0; + *start = max3(left, vma->vm_start, faddr & PMD_MASK); + *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE); + + return win; } /** @@ -819,24 +732,20 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, struct swap_iocb *splug = NULL; struct folio *folio; pte_t *pte = NULL, pentry; - unsigned long addr; + int win; + unsigned long start, end, addr; swp_entry_t entry; pgoff_t ilx; - unsigned int i; bool page_allocated; - struct vma_swap_readahead ra_info = { - .win = 1, - }; - swap_ra_info(vmf, &ra_info); - if (ra_info.win == 1) + win = swap_vma_ra_win(vmf, &start, &end); + if (win == 1) goto skip; - addr = vmf->address - (ra_info.offset * PAGE_SIZE); - ilx = targ_ilx - ra_info.offset; + ilx = targ_ilx - PFN_DOWN(vmf->address - start); blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { + for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -855,8 +764,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, if (!folio) continue; if (page_allocated) { - swap_read_folio(folio, false, &splug); - if (i != ra_info.offset) { + swap_read_folio(folio, &splug); + if (addr != vmf->address) { folio_set_readahead(folio); count_vm_event(SWAP_RA); } @@ -872,10 +781,8 @@ skip: /* The folio was likely read above, so no need for plugging here */ folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, &page_allocated, false); - if (unlikely(page_allocated)) { - zswap_folio_swapin(folio); - swap_read_folio(folio, false, NULL); - } + if (unlikely(page_allocated)) + swap_read_folio(folio, NULL); return folio; } @@ -885,13 +792,13 @@ skip: * @gfp_mask: memory allocation flags * @vmf: fault information * - * Returns the struct page for entry and addr, after queueing swapin. + * Returns the struct folio for entry and addr, after queueing swapin. * * It's a main entry function for swap readahead. By the configuration, * it will read ahead blocks by cluster-based(ie, physical disk based) * or vma-based(ie, virtual address based on faulty address) readahead. */ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, +struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { struct mempolicy *mpol; @@ -904,17 +811,14 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, swap_cluster_readahead(entry, gfp_mask, mpol, ilx); mpol_cond_put(mpol); - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); + return folio; } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%s\n", - enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, |