summaryrefslogtreecommitdiff
path: root/mm/swap_state.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--mm/swap_state.c308
1 files changed, 106 insertions, 202 deletions
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bfc7e8c58a6d..c354435a0923 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -20,7 +20,6 @@
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
-#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
@@ -28,10 +27,9 @@
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_page_list.
+ * vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
@@ -42,6 +40,8 @@ struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;
+#define SWAP_RA_ORDER_CEILING 5
+
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
@@ -72,24 +72,24 @@ void show_swap_cache_info(void)
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swp_offset(entry);
- struct page *page;
+ pgoff_t idx = swap_cache_index(entry);
+ void *shadow;
- page = xa_load(&address_space->i_pages, idx);
- if (xa_is_value(page))
- return page;
+ shadow = xa_load(&address_space->i_pages, idx);
+ if (xa_is_value(shadow))
+ return shadow;
return NULL;
}
/*
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
- * but sets SwapCache flag and private instead of mapping and index.
+ * but sets SwapCache flag and 'swap' instead of mapping and index.
*/
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swp_offset(entry);
+ pgoff_t idx = swap_cache_index(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
unsigned long i, nr = folio_nr_pages(folio);
void *old;
@@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct folio *folio,
struct address_space *address_space = swap_address_space(entry);
int i;
long nr = folio_nr_pages(folio);
- pgoff_t idx = swp_offset(entry);
+ pgoff_t idx = swap_cache_index(entry);
XA_STATE(xas, &address_space->i_pages, idx);
xas_set_update(&xas, workingset_update_node);
@@ -165,67 +165,6 @@ void __delete_from_swap_cache(struct folio *folio,
__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
-/**
- * add_to_swap - allocate swap space for a folio
- * @folio: folio we want to move to swap
- *
- * Allocate swap space for the folio and add the folio to the
- * swap cache.
- *
- * Context: Caller needs to hold the folio lock.
- * Return: Whether the folio was added to the swap cache.
- */
-bool add_to_swap(struct folio *folio)
-{
- swp_entry_t entry;
- int err;
-
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
-
- entry = folio_alloc_swap(folio);
- if (!entry.val)
- return false;
-
- /*
- * XArray node allocations from PF_MEMALLOC contexts could
- * completely exhaust the page allocator. __GFP_NOMEMALLOC
- * stops emergency reserves from being allocated.
- *
- * TODO: this could cause a theoretical memory reclaim
- * deadlock in the swap out path.
- */
- /*
- * Add it to the swap cache.
- */
- err = add_to_swap_cache(folio, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
- if (err)
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- goto fail;
- /*
- * Normally the folio will be dirtied in unmap because its
- * pte should be dirty. A special case is MADV_FREE page. The
- * page's pte could have dirty bit cleared but the folio's
- * SwapBacked flag is still set because clearing the dirty bit
- * and SwapBacked flag has no lock protected. For such folio,
- * unmap will not set dirty bit for it, so folio reclaim will
- * not write the folio out. This can cause data corruption when
- * the folio is swapped in later. Always setting the dirty flag
- * for the folio solves the problem.
- */
- folio_mark_dirty(folio);
-
- return true;
-
-fail:
- put_swap_folio(folio, entry);
- return false;
-}
-
/*
* This must be called only on folios that have
* been verified to be in the swap cache and locked.
@@ -253,13 +192,14 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
for (;;) {
swp_entry_t entry = swp_entry(type, curr);
+ unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK;
struct address_space *address_space = swap_address_space(entry);
- XA_STATE(xas, &address_space->i_pages, curr);
+ XA_STATE(xas, &address_space->i_pages, index);
xas_set_update(&xas, workingset_update_node);
xa_lock_irq(&address_space->i_pages);
- xas_for_each(&xas, old, end) {
+ xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) {
if (!xa_is_value(old))
continue;
xas_store(&xas, NULL);
@@ -267,9 +207,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
xa_unlock_irq(&address_space->i_pages);
/* search the next swapcache until we meet end */
- curr >>= SWAP_ADDRESS_SPACE_SHIFT;
- curr++;
- curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES);
if (curr > end)
break;
}
@@ -293,15 +231,13 @@ void free_swap_cache(struct folio *folio)
}
/*
- * Perform a free_page(), also freeing any swap cache associated with
- * this page if it is the last user of the page.
+ * Freeing a folio and also freeing any swap cache associated with
+ * this folio if it is the last user.
*/
-void free_page_and_swap_cache(struct page *page)
+void free_folio_and_swap_cache(struct folio *folio)
{
- struct folio *folio = page_folio(page);
-
free_swap_cache(folio);
- if (!is_huge_zero_page(page))
+ if (!is_huge_zero_folio(folio))
folio_put(folio);
}
@@ -314,7 +250,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
struct folio_batch folios;
unsigned int refs[PAGEVEC_SIZE];
- lru_add_drain();
folio_batch_init(&folios);
for (int i = 0; i < nr; i++) {
struct folio *folio = page_folio(encoded_page_ptr(pages[i]));
@@ -350,7 +285,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
{
struct folio *folio;
- folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
if (!IS_ERR(folio)) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -420,7 +355,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
si = get_swap_device(swp);
if (!si)
return ERR_PTR(-ENOENT);
- index = swp_offset(swp);
+ index = swap_cache_index(swp);
folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
return folio;
@@ -430,15 +365,13 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
bool skip_if_exists)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct folio *folio;
+ struct folio *new_folio = NULL;
+ struct folio *result = NULL;
void *shadow = NULL;
*new_page_allocated = false;
- si = get_swap_device(entry);
- if (!si)
- return NULL;
-
for (;;) {
int err;
/*
@@ -447,41 +380,36 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* that would confuse statistics.
*/
folio = filemap_get_folio(swap_address_space(entry),
- swp_offset(entry));
+ swap_cache_index(entry));
if (!IS_ERR(folio))
goto got_folio;
/*
* Just skip read ahead for unused swap slot.
- * During swap_off when swap_slot_cache is disabled,
- * we have to handle the race between putting
- * swap entry in swap cache and marking swap slot
- * as SWAP_HAS_CACHE. That's done in later part of code or
- * else swap_off will be aborted if we return NULL.
*/
- if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
- goto fail_put_swap;
+ if (!swap_entry_swapped(si, entry))
+ goto put_and_return;
/*
- * Get a new folio to read into from swap. Allocate it now,
- * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
- * cause any racers to loop around until we add it to cache.
+ * Get a new folio to read into from swap. Allocate it now if
+ * new_folio not exist, before marking swap_map SWAP_HAS_CACHE,
+ * when -EEXIST will cause any racers to loop around until we
+ * add it to cache.
*/
- folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
- mpol, ilx, numa_node_id());
- if (!folio)
- goto fail_put_swap;
+ if (!new_folio) {
+ new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
+ if (!new_folio)
+ goto put_and_return;
+ }
/*
* Swap entry may have been freed since our caller observed it.
*/
- err = swapcache_prepare(entry);
+ err = swapcache_prepare(entry, 1);
if (!err)
break;
-
- folio_put(folio);
- if (err != -EEXIST)
- goto fail_put_swap;
+ else if (err != -EEXIST)
+ goto put_and_return;
/*
* Protect against a recursive call to __read_swap_cache_async()
@@ -492,7 +420,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async() in the writeback path.
*/
if (skip_if_exists)
- goto fail_put_swap;
+ goto put_and_return;
/*
* We might race against __delete_from_swap_cache(), and
@@ -507,36 +435,36 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* The swap entry is ours to swap in. Prepare the new folio.
*/
+ __folio_set_locked(new_folio);
+ __folio_set_swapbacked(new_folio);
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
+ if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
+ if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- mem_cgroup_swapin_uncharge_swap(entry);
+ memcg1_swapin(entry, 1);
if (shadow)
- workingset_refault(folio, shadow);
+ workingset_refault(new_folio, shadow);
- /* Caller will initiate read into locked folio */
- folio_add_lru(folio);
+ /* Caller will initiate read into locked new_folio */
+ folio_add_lru(new_folio);
*new_page_allocated = true;
+ folio = new_folio;
got_folio:
- put_swap_device(si);
- return folio;
+ result = folio;
+ goto put_and_return;
fail_unlock:
- put_swap_folio(folio, entry);
- folio_unlock(folio);
- folio_put(folio);
-fail_put_swap:
- put_swap_device(si);
- return NULL;
+ put_swap_folio(new_folio, entry);
+ folio_unlock(new_folio);
+put_and_return:
+ if (!(*new_page_allocated) && new_folio)
+ folio_put(new_folio);
+ return result;
}
/*
@@ -553,18 +481,25 @@ struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
struct swap_iocb **plug)
{
+ struct swap_info_struct *si;
bool page_allocated;
struct mempolicy *mpol;
pgoff_t ilx;
struct folio *folio;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
+
mpol = get_vma_policy(vma, addr, 0, &ilx);
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
mpol_cond_put(mpol);
if (page_allocated)
- swap_read_folio(folio, false, plug);
+ swap_read_folio(folio, plug);
+
+ put_swap_device(si);
return folio;
}
@@ -681,7 +616,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, false, &splug);
+ swap_read_folio(folio, &splug);
if (offset != entry_offset) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
@@ -696,10 +631,8 @@ skip:
/* The page was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
- if (unlikely(page_allocated)) {
- zswap_folio_swapin(folio);
- swap_read_folio(folio, false, NULL);
- }
+ if (unlikely(page_allocated))
+ swap_read_folio(folio, NULL);
return folio;
}
@@ -738,62 +671,42 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
-#define SWAP_RA_ORDER_CEILING 5
-
-struct vma_swap_readahead {
- unsigned short win;
- unsigned short offset;
- unsigned short nr_pte;
-};
-
-static void swap_ra_info(struct vm_fault *vmf,
- struct vma_swap_readahead *ra_info)
+static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
+ unsigned long *end)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
- unsigned long faddr, pfn, fpfn, lpfn, rpfn;
- unsigned long start, end;
+ unsigned long faddr, prev_faddr, left, right;
unsigned int max_win, hits, prev_win, win;
- max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
- SWAP_RA_ORDER_CEILING);
- if (max_win == 1) {
- ra_info->win = 1;
- return;
- }
+ max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
+ if (max_win == 1)
+ return 1;
faddr = vmf->address;
- fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
- pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
+ prev_faddr = SWAP_RA_ADDR(ra_val);
prev_win = SWAP_RA_WIN(ra_val);
hits = SWAP_RA_HITS(ra_val);
- ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
- max_win, prev_win);
- atomic_long_set(&vma->swap_readahead_info,
- SWAP_RA_VAL(faddr, win, 0));
+ win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
+ max_win, prev_win);
+ atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
if (win == 1)
- return;
-
- if (fpfn == pfn + 1) {
- lpfn = fpfn;
- rpfn = fpfn + win;
- } else if (pfn == fpfn + 1) {
- lpfn = fpfn - win + 1;
- rpfn = fpfn + 1;
- } else {
- unsigned int left = (win - 1) / 2;
-
- lpfn = fpfn - left;
- rpfn = fpfn + win - left;
- }
- start = max3(lpfn, PFN_DOWN(vma->vm_start),
- PFN_DOWN(faddr & PMD_MASK));
- end = min3(rpfn, PFN_DOWN(vma->vm_end),
- PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+ return 1;
- ra_info->nr_pte = end - start;
- ra_info->offset = fpfn - start;
+ if (faddr == prev_faddr + PAGE_SIZE)
+ left = faddr;
+ else if (prev_faddr == faddr + PAGE_SIZE)
+ left = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
+ else
+ left = faddr - (((win - 1) / 2) << PAGE_SHIFT);
+ right = left + (win << PAGE_SHIFT);
+ if ((long)left < 0)
+ left = 0;
+ *start = max3(left, vma->vm_start, faddr & PMD_MASK);
+ *end = min3(right, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);
+
+ return win;
}
/**
@@ -819,24 +732,20 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
struct folio *folio;
pte_t *pte = NULL, pentry;
- unsigned long addr;
+ int win;
+ unsigned long start, end, addr;
swp_entry_t entry;
pgoff_t ilx;
- unsigned int i;
bool page_allocated;
- struct vma_swap_readahead ra_info = {
- .win = 1,
- };
- swap_ra_info(vmf, &ra_info);
- if (ra_info.win == 1)
+ win = swap_vma_ra_win(vmf, &start, &end);
+ if (win == 1)
goto skip;
- addr = vmf->address - (ra_info.offset * PAGE_SIZE);
- ilx = targ_ilx - ra_info.offset;
+ ilx = targ_ilx - PFN_DOWN(vmf->address - start);
blk_start_plug(&plug);
- for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
+ for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -855,8 +764,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!folio)
continue;
if (page_allocated) {
- swap_read_folio(folio, false, &splug);
- if (i != ra_info.offset) {
+ swap_read_folio(folio, &splug);
+ if (addr != vmf->address) {
folio_set_readahead(folio);
count_vm_event(SWAP_RA);
}
@@ -872,10 +781,8 @@ skip:
/* The folio was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated, false);
- if (unlikely(page_allocated)) {
- zswap_folio_swapin(folio);
- swap_read_folio(folio, false, NULL);
- }
+ if (unlikely(page_allocated))
+ swap_read_folio(folio, NULL);
return folio;
}
@@ -885,13 +792,13 @@ skip:
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
- * Returns the struct page for entry and addr, after queueing swapin.
+ * Returns the struct folio for entry and addr, after queueing swapin.
*
* It's a main entry function for swap readahead. By the configuration,
* it will read ahead blocks by cluster-based(ie, physical disk based)
* or vma-based(ie, virtual address based on faulty address) readahead.
*/
-struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct mempolicy *mpol;
@@ -904,17 +811,14 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
mpol_cond_put(mpol);
- if (!folio)
- return NULL;
- return folio_file_page(folio, swp_offset(entry));
+ return folio;
}
#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- enable_vma_readahead ? "true" : "false");
+ return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead));
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,