author     Linus Torvalds <torvalds@linux-foundation.org>  2023-06-28 10:28:11 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-06-28 10:28:11 -0700
commit     6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch)
tree       2c425707f78642625dbe2c824c7fded2021e3dc7 /mm/swap_state.c
parent     6aeadf7896bff4ca230702daba8788455e6b866e (diff)
parent     acc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff)
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:

 - Yosry Ahmed brought back some cgroup v1 stats in OOM logs

 - Yosry has also eliminated cgroup's atomic rstat flushing

 - Nhat Pham adds the new cachestat() syscall. It provides userspace with the ability to query pagecache status - a similar concept to mincore() but more powerful and with improved usability

 - Mel Gorman provides more optimizations for compaction, reducing the prevalence of page rescanning

 - Lorenzo Stoakes has done some maintenance work on the get_user_pages() interface

 - Liam Howlett continues with cleanups and maintenance work to the maple tree code. Peng Zhang also does some work on maple tree

 - Johannes Weiner has done some cleanup work on the compaction code

 - David Hildenbrand has contributed additional selftests for get_user_pages()

 - Thomas Gleixner has contributed some maintenance and optimization work for the vmalloc code

 - Baolin Wang has provided some compaction cleanups

 - SeongJae Park continues maintenance work on the DAMON code

 - Huang Ying has done some maintenance on the swap code's usage of device refcounting

 - Christoph Hellwig has some cleanups for the filemap/directio code

 - Ryan Roberts provides two patch series which yield some rationalization of the kernel's access to pte entries - use the provided APIs rather than open-coding accesses

 - Lorenzo Stoakes has some fixes to the interaction between pagecache and directio access to file mappings

 - John Hubbard has a series of fixes to the MM selftesting code

 - ZhangPeng continues the folio conversion campaign

 - Hugh Dickins has been working on the pagetable handling code, mainly with a view to reducing the load on the mmap_lock

 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from 128 to 8

 - Domenico Cerasuolo has improved the zswap reclaim mechanism by reorganizing the LRU management

 - Matthew Wilcox provides some fixups to make gfs2 work better with the buffer_head code

 - Vishal Moola also has done some folio conversion work

 - Matthew Wilcox has removed the remnants of the pagevec code - their functionality is migrated over to struct folio_batch

* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
  mm/hugetlb: remove hugetlb_set_page_subpool()
  mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
  hugetlb: revert use of page_cache_next_miss()
  Revert "page cache: fix page_cache_next/prev_miss off by one"
  mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
  mm: memcg: rename and document global_reclaim()
  mm: kill [add|del]_page_to_lru_list()
  mm: compaction: convert to use a folio in isolate_migratepages_block()
  mm: zswap: fix double invalidate with exclusive loads
  mm: remove unnecessary pagevec includes
  mm: remove references to pagevec
  mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
  mm: remove struct pagevec
  net: convert sunrpc from pagevec to folio_batch
  i915: convert i915_gpu_error to use a folio_batch
  pagevec: rename fbatch_count()
  mm: remove check_move_unevictable_pages()
  drm: convert drm_gem_put_pages() to use a folio_batch
  i915: convert shmem_sg_free_table() to use a folio_batch
  scatterlist: add sg_set_folio()
  ...
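For the cachestat() syscall mentioned above, a minimal userspace sketch follows. It assumes a kernel that ships the syscall; the fallback syscall number (451) and the locally declared structures are workarounds for builds against older headers, not something taken from this merge.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>

/* Declared locally so the sketch builds even if the installed UAPI
 * headers predate cachestat(). */
struct cachestat_range {
	__u64 off;
	__u64 len;	/* len == 0 queries to the end of the file */
};

struct cachestat {
	__u64 nr_cache;
	__u64 nr_dirty;
	__u64 nr_writeback;
	__u64 nr_evicted;
	__u64 nr_recently_evicted;
};

#ifndef __NR_cachestat
#define __NR_cachestat 451	/* assumed number; verify for your architecture */
#endif

int main(int argc, char **argv)
{
	struct cachestat_range range = { 0, 0 };	/* whole file */
	struct cachestat cs;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (syscall(__NR_cachestat, fd, &range, &cs, 0)) {
		perror("cachestat");
		close(fd);
		return 1;
	}
	printf("cached %llu, dirty %llu, writeback %llu, evicted %llu, recently evicted %llu\n",
	       (unsigned long long)cs.nr_cache,
	       (unsigned long long)cs.nr_dirty,
	       (unsigned long long)cs.nr_writeback,
	       (unsigned long long)cs.nr_evicted,
	       (unsigned long long)cs.nr_recently_evicted);
	close(fd);
	return 0;
}

The counts are reported in pages for the byte range described by cachestat_range, which is what makes it more expressive than mincore().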
Diffstat (limited to 'mm/swap_state.c')
-rw-r--r--	mm/swap_state.c	87
1 file changed, 50 insertions(+), 37 deletions(-)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b76a65ac28b3..f8ea7015bad4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,7 +16,6 @@
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
-#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
@@ -275,9 +274,9 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
}
}
-/*
- * If we are the only user, then try to free up the swap cache.
- *
+/*
+ * If we are the only user, then try to free up the swap cache.
+ *
* Its ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
* folio_free_swap() _with_ the lock.
@@ -294,7 +293,7 @@ void free_swap_cache(struct page *page)
}
}
-/*
+/*
* Perform a free_page(), also freeing any swap cache associated with
* this page if it is the last user of the page.
*/
@@ -417,9 +416,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct folio *folio;
+ struct page *page;
void *shadow = NULL;
*new_page_allocated = false;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
for (;;) {
int err;
@@ -428,14 +431,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
- si = get_swap_device(entry);
- if (!si)
- return NULL;
folio = filemap_get_folio(swap_address_space(entry),
swp_offset(entry));
- put_swap_device(si);
- if (!IS_ERR(folio))
- return folio_file_page(folio, swp_offset(entry));
+ if (!IS_ERR(folio)) {
+ page = folio_file_page(folio, swp_offset(entry));
+ goto got_page;
+ }
/*
* Just skip read ahead for unused swap slot.
@@ -445,8 +446,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* as SWAP_HAS_CACHE. That's done in later part of code or
* else swap_off will be aborted if we return NULL.
*/
- if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- return NULL;
+ if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
+ goto fail_put_swap;
/*
* Get a new page to read into from swap. Allocate it now,
@@ -455,7 +456,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
if (!folio)
- return NULL;
+ goto fail_put_swap;
/*
* Swap entry may have been freed since our caller observed it.
@@ -466,7 +467,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
folio_put(folio);
if (err != -EEXIST)
- return NULL;
+ goto fail_put_swap;
/*
* We might race against __delete_from_swap_cache(), and
@@ -500,12 +501,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/* Caller will initiate read into locked folio */
folio_add_lru(folio);
*new_page_allocated = true;
- return &folio->page;
+ page = &folio->page;
+got_page:
+ put_swap_device(si);
+ return page;
fail_unlock:
put_swap_folio(folio, entry);
folio_unlock(folio);
folio_put(folio);
+fail_put_swap:
+ put_swap_device(si);
return NULL;
}
@@ -514,6 +520,10 @@ fail_unlock:
* and reading the disk if it is not already cached.
* A failure return means that either the page allocation failed or that
* the swap entry is no longer in use.
+ *
+ * get/put_swap_device() aren't needed to call this function, because
+ * __read_swap_cache_async() call them and swap_readpage() holds the
+ * swap cache folio lock.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma,
@@ -698,6 +708,14 @@ void exit_swap_address_space(unsigned int type)
swapper_spaces[type] = NULL;
}
+#define SWAP_RA_ORDER_CEILING 5
+
+struct vma_swap_readahead {
+ unsigned short win;
+ unsigned short offset;
+ unsigned short nr_pte;
+};
+
static void swap_ra_info(struct vm_fault *vmf,
struct vma_swap_readahead *ra_info)
{
@@ -705,11 +723,7 @@ static void swap_ra_info(struct vm_fault *vmf,
unsigned long ra_val;
unsigned long faddr, pfn, fpfn, lpfn, rpfn;
unsigned long start, end;
- pte_t *pte, *orig_pte;
unsigned int max_win, hits, prev_win, win;
-#ifndef CONFIG_64BIT
- pte_t *tpte;
-#endif
max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
SWAP_RA_ORDER_CEILING);
@@ -728,12 +742,9 @@ static void swap_ra_info(struct vm_fault *vmf,
max_win, prev_win);
atomic_long_set(&vma->swap_readahead_info,
SWAP_RA_VAL(faddr, win, 0));
-
if (win == 1)
return;
- /* Copy the PTEs because the page table may be unmapped */
- orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
if (fpfn == pfn + 1) {
lpfn = fpfn;
rpfn = fpfn + win;
@@ -753,15 +764,6 @@ static void swap_ra_info(struct vm_fault *vmf,
ra_info->nr_pte = end - start;
ra_info->offset = fpfn - start;
- pte -= ra_info->offset;
-#ifdef CONFIG_64BIT
- ra_info->ptes = pte;
-#else
- tpte = ra_info->ptes;
- for (pfn = start; pfn != end; pfn++)
- *tpte++ = *pte++;
-#endif
- pte_unmap(orig_pte);
}
/**
@@ -785,7 +787,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
- pte_t *pte, pentry;
+ pte_t *pte = NULL, pentry;
+ unsigned long addr;
swp_entry_t entry;
unsigned int i;
bool page_allocated;
@@ -797,17 +800,25 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (ra_info.win == 1)
goto skip;
+ addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+
blk_start_plug(&plug);
- for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
- i++, pte++) {
- pentry = *pte;
+ for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+ if (!pte++) {
+ pte = pte_offset_map(vmf->pmd, addr);
+ if (!pte)
+ break;
+ }
+ pentry = ptep_get_lockless(pte);
if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
if (unlikely(non_swap_entry(entry)))
continue;
+ pte_unmap(pte);
+ pte = NULL;
page = __read_swap_cache_async(entry, gfp_mask, vma,
- vmf->address, &page_allocated);
+ addr, &page_allocated);
if (!page)
continue;
if (page_allocated) {
@@ -819,6 +830,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
}
put_page(page);
}
+ if (pte)
+ pte_unmap(pte);
blk_finish_plug(&plug);
swap_read_unplug(splug);
lru_add_drain();
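The readahead hunks above drop the up-front PTE copy in favor of a lazy walk: the page table is mapped only while a PTE is being examined, the entry is sampled with ptep_get_lockless(), and the mapping is released before anything that may sleep. Below is a minimal sketch of that pattern, assuming 6.5-era headers where pte_offset_map() can fail; walk_swap_ptes() and its handle() callback are hypothetical names for illustration, not code from the patch.

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

/* Hypothetical helper: visit every swap PTE covering [start, end) under
 * one PMD. @handle may sleep, so the PTE mapping is dropped first. */
static void walk_swap_ptes(pmd_t *pmd, unsigned long start, unsigned long end,
			   void (*handle)(swp_entry_t entry, unsigned long addr))
{
	pte_t *pte = NULL;
	unsigned long addr;

	for (addr = start; addr != end; addr += PAGE_SIZE) {
		pte_t pentry;
		swp_entry_t entry;

		if (!pte) {
			/* May return NULL if the page table was freed under us. */
			pte = pte_offset_map(pmd, addr);
			if (!pte)
				break;
		}
		/* Snapshot the entry without taking the page table lock. */
		pentry = ptep_get_lockless(pte);
		if (!is_swap_pte(pentry)) {
			pte++;
			continue;
		}
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry))) {
			pte++;
			continue;
		}
		/* Unmap before calling out to code that may sleep. */
		pte_unmap(pte);
		pte = NULL;
		handle(entry, addr);
	}
	if (pte)
		pte_unmap(pte);
}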