From be45a4902c7caa717fee6b2f671e59b396ed395c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:30 -0400 Subject: mm/swap: cache maximum swapfile size when init swap We used to have max_swapfile_size() fetching the maximum swapfile size per arch. As the number of callers of max_swapfile_size() grows, this patch introduces a variable "swapfile_maximum_size" and caches the value of the old max_swapfile_size() in it, so that we don't need to calculate the value every time. Caching the value in swapfile_init() is safe because by the time we reach that phase all the relevant information has been initialized. The major arch to take care of is x86, which defines the max swapfile size based on the L1TF mitigation: both X86_BUG_L1TF and l1tf_mitigation should have been set up properly by the time swapfile_init() runs. As a reference, the code path looks like this for x86: - start_kernel - setup_arch - early_cpu_init - early_identify_cpu --> setup X86_BUG_L1TF - parse_early_param - l1tf_cmdline --> set l1tf_mitigation - check_bugs - l1tf_select_mitigation --> set l1tf_mitigation - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) On non-x86 archs the swapfile size only depends on the swp pte format, so caching it is safe there too. While at it, rename max_swapfile_size() to arch_max_swapfile_size(), since an arch can define its own version and an "arch_" prefix makes that more straightforward. Meanwhile, export swapfile_maximum_size to replace the old usages of max_swapfile_size(). [peterx@redhat.com: declare arch_max_swapfile_size() in swapfile.h] Link: https://lkml.kernel.org/r/YxTh1GuC6ro5fKL5@xz-m1.local Link: https://lkml.kernel.org/r/20220811161331.37055-7-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 1fdccd2f1422..3cc64399df44 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority = -1; +unsigned long swapfile_maximum_size; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -2816,7 +2817,7 @@ unsigned long generic_max_swapfile_size(void) } /* Can be overridden by an architecture for additional checks. 
*/ -__weak unsigned long max_swapfile_size(void) +__weak unsigned long arch_max_swapfile_size(void) { return generic_max_swapfile_size(); } @@ -2856,7 +2857,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, p->cluster_next = 1; p->cluster_nr = 0; - maxpages = max_swapfile_size(); + maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; if (!last_page) { pr_warn("Empty swap-file\n"); @@ -3677,6 +3678,8 @@ static int __init swapfile_init(void) for_each_node(nid) plist_head_init(&swap_avail_heads[nid]); + swapfile_maximum_size = arch_max_swapfile_size(); + return 0; } subsys_initcall(swapfile_init); -- cgit From 5154e607967d3f587fda84a40abbf900275016c9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 12:13:31 -0400 Subject: mm/swap: cache swap migration A/D bits support Introduce a variable swap_migration_ad_supported to cache whether the arch supports swap migration A/D bits. Here one thing to mention is that SWP_MIG_TOTAL_BITS will internally reference the other macro MAX_PHYSMEM_BITS, which is a function call on x86 (constant on all the rest of archs). It's safe to reference it in swapfile_init() because when reaching here we're already during initcalls level 4 so we must have initialized 5-level pgtable for x86_64 (right after early_identify_cpu() finishes). - start_kernel - setup_arch - early_cpu_init - get_cpu_cap --> fetch from CPUID (including X86_FEATURE_LA57) - early_identify_cpu --> clear X86_FEATURE_LA57 (if early lvl5 not enabled (USE_EARLY_PGTABLE_L5)) - arch_call_rest_init - rest_init - kernel_init - kernel_init_freeable - do_basic_setup - do_initcalls --> calls swapfile_init() (initcall level 4) This should slightly speed up the migration swap entry handlings. Link: https://lkml.kernel.org/r/20220811161331.37055-8-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alistair Popple Cc: Andi Kleen Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Huang Ying Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Minchan Kim Cc: Nadav Amit Cc: Vlastimil Babka Cc: Dave Hansen Signed-off-by: Andrew Morton --- mm/swapfile.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 3cc64399df44..263b19e693cf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -64,6 +64,9 @@ EXPORT_SYMBOL_GPL(nr_swap_pages); long total_swap_pages; static int least_priority = -1; unsigned long swapfile_maximum_size; +#ifdef CONFIG_MIGRATION +bool swap_migration_ad_supported; +#endif /* CONFIG_MIGRATION */ static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -3680,6 +3683,11 @@ static int __init swapfile_init(void) swapfile_maximum_size = arch_max_swapfile_size(); +#ifdef CONFIG_MIGRATION + if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) + swap_migration_ad_supported = true; +#endif /* CONFIG_MIGRATION */ + return 0; } subsys_initcall(swapfile_init); -- cgit From 208c09db6d88f4442fb755d20cfb237a37a49f48 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: mm/swapfile: use vma iterator instead of vma linked list unuse_mm() no longer needs to reference the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-64-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 263b19e693cf..469d9af86be2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1994,14 +1994,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); -- cgit From 14d01ee9fcb901c9e020f2dcd71c500f10c3bd03 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:05 +0100 Subject: mm/swapfile: remove page_swapcount() By restructuring folio_swapped(), it can use swap_swapcount() instead of page_swapcount(). It's even a little more efficient. Link: https://lkml.kernel.org/r/20220902194653.1739778-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 469d9af86be2..e0aaeac5c829 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1431,30 +1431,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) spin_unlock(&p->lock); } -/* - * How many references to page are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -static int page_swapcount(struct page *page) -{ - int count = 0; - struct swap_info_struct *p; - struct swap_cluster_info *ci; - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - p = _swap_info_get(entry); - if (p) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - count = swap_count(p->swap_map[offset]); - unlock_cluster_or_swap_info(p, ci); - } - return count; -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; @@ -1469,11 +1445,16 @@ int __swap_count(swp_entry_t entry) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. 
+ */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { - int count = 0; pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; + int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); @@ -1574,17 +1555,16 @@ unlock_out: static bool folio_swapped(struct folio *folio) { - swp_entry_t entry; - struct swap_info_struct *si; + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return page_swapcount(&folio->page) != 0; + return swap_swapcount(si, entry) != 0; - entry = folio_swap_entry(folio); - si = _swap_info_get(entry); - if (si) - return swap_page_trans_huge_swapped(si, entry); - return false; + return swap_page_trans_huge_swapped(si, entry); } /* -- cgit From bdb0ed54a4768dc3c2613d4c45f94c887d43cd7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index e0aaeac5c829..f2a446799a39 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1567,43 +1567,47 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry); } -/* - * If swap is getting full, or if there are no more mappings of this page, - * then try_to_free_swap is called to free its swap space. +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. */ -int try_to_free_swap(struct page *page) +bool folio_free_swap(struct folio *folio) { - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) - return 0; + return false; if (folio_test_writeback(folio)) - return 0; + return false; if (folio_swapped(folio)) - return 0; + return false; /* * Once hibernation has begun to create its image of memory, - * there's a danger that one of the calls to try_to_free_swap() + * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free - * the swap from a page which has already been recorded in the - * image as a clean swapcache page, and then reuse its swap for + * the swap from a folio which has already been recorded in the + * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the - * original page might be freed under memory pressure, then + * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. 
*/ if (pm_suspended_storage()) - return 0; + return false; delete_from_swap_cache(folio); folio_set_dirty(folio); - return 1; + return true; } /* -- cgit From 4081f7446d95a9d3ced12dc04ff02c187a761e90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index f2a446799a39..aafe739dc2a6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1332,7 +1332,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void put_swap_page(struct page *page, swp_entry_t entry) +void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1341,7 +1341,7 @@ void put_swap_page(struct page *page, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(thp_nr_pages(page)); + int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) -- cgit From 000085b9af9f3ca13dd672a753f815ac0cb45d0a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:30 +0100 Subject: swapfile: convert try_to_unuse() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-35-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index aafe739dc2a6..23cdbe8e47cf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2032,7 +2032,7 @@ static int try_to_unuse(unsigned int type) struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int i; @@ -2082,21 +2082,21 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - page = find_get_page(swap_address_space(entry), i); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), i); + if (!folio) continue; /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock. The page + * It is conceivable that a racing task removed this folio from + * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But - * that is okay, try_to_free_swap() only removes stale pages. + * that is okay, folio_free_swap() only removes stale folios. */ - lock_page(page); - wait_on_page_writeback(page); - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + folio_wait_writeback(folio); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); } /* -- cgit From 2c3f6194b008b23e52a8e135bdd56b67fdaa55ca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:31 +0100 Subject: swapfile: convert __try_to_reclaim_swap() to use a folio Saves five calls to compound_head(). 
Link: https://lkml.kernel.org/r/20220902194653.1739778-36-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 23cdbe8e47cf..e3e1bd3d20b1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -132,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct page *page; + struct folio *folio; int ret = 0; - page = find_get_page(swap_address_space(entry), offset); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use try_to_free_swap() with explicit lock_page() + * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (trylock_page(page)) { + if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) - ret = try_to_free_swap(page); - unlock_page(page); + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ret = folio_free_swap(folio); + folio_unlock(folio); } - put_page(page); + folio_put(folio); return ret; } -- cgit From f102cd8b173e066179b472fb6e3b18e31a1cc394 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:32 +0100 Subject: swapfile: convert unuse_pte_range() to use a folio Delay fetching the precise page from the folio until we're in unuse_pte(). Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-37-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index e3e1bd3d20b1..3820b5ab64d9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1758,8 +1758,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) * force COW, vm_page_prot omits write permission from any private vma. 
*/ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct folio *folio) { + struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; @@ -1831,17 +1832,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - struct page *page; swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; - unsigned long offset; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { + struct folio *folio; + unsigned long offset; + if (!is_swap_pte(*pte)) continue; @@ -1852,8 +1854,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; - page = lookup_swap_cache(entry, vma, addr); - if (!page) { + folio = swap_cache_get_folio(entry, vma, addr); + if (!folio) { + struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1863,25 +1866,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (page) + folio = page_folio(page); } - if (!page) { + if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } - lock_page(page); - wait_on_page_writeback(page); - ret = unuse_pte(vma, pmd, addr, entry, page); + folio_lock(folio); + folio_wait_writeback(folio); + ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 3820b5ab64d9..4efcfe34e45b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -148,7 +148,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } -- cgit
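The first patch above (be45a490, "mm/swap: cache maximum swapfile size when init swap") leans on two C/linker idioms: a __weak default that an architecture may override with a strong definition in its own object file (which is how x86 applies its L1TF-aware limit), and a value computed once from an initcall rather than recomputed at every call site. A minimal userspace sketch of that pattern follows; it is not kernel code, SWP_OFFSET_BITS is an assumed width chosen purely for illustration, and the real generic_max_swapfile_size() derives the limit by round-tripping a maximal offset through the arch swp pte format.

#include <stdio.h>

/* Assumed offset width, for illustration only (and assuming a 64-bit build);
 * the kernel derives the real limit from the arch swp pte layout. */
#define SWP_OFFSET_BITS 50

static unsigned long generic_max_swapfile_size(void)
{
	unsigned long max_offset = (1UL << SWP_OFFSET_BITS) - 1;

	return max_offset + 1;	/* pages: highest representable offset + 1 */
}

/* Weak default: an arch (x86 with the L1TF mitigation, say) overrides this
 * by providing a non-weak arch_max_swapfile_size() in its own object file. */
__attribute__((weak)) unsigned long arch_max_swapfile_size(void)
{
	return generic_max_swapfile_size();
}

static unsigned long swapfile_maximum_size;

static void swapfile_init(void)
{
	/* computed once, at "initcall" time, instead of per call site */
	swapfile_maximum_size = arch_max_swapfile_size();
}

int main(void)
{
	swapfile_init();
	printf("max swapfile size: %lu pages\n", swapfile_maximum_size);
	return 0;
}

Linking in a second object file that provides a non-weak arch_max_swapfile_size() makes that definition win over the weak one, so an arch-specific limit slots in without any #ifdef at the call site.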
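The second patch (5154e607, "mm/swap: cache swap migration A/D bits support") enables swap_migration_ad_supported only when swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS). The idea is that a migration entry has to pack a PFN plus a couple of extra state bits (young/dirty) into the swap offset field, and swapfile_maximum_size, being roughly 2^(usable offset bits) pages, tells us whether that field is wide enough. The sketch below redoes that arithmetic with assumed values for MAX_PHYSMEM_BITS, PAGE_SHIFT and the number of A/D bits; the authoritative bit layout lives in swapops.h, so treat these constants as illustrative only.

#include <stdio.h>

/* Illustrative values only -- the real ones come from the arch headers. */
#define MAX_PHYSMEM_BITS	46	/* e.g. x86-64 with 4-level page tables */
#define PAGE_SHIFT		12
#define SWP_MIG_AD_BITS		2	/* young + dirty */

#define SWP_PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define SWP_MIG_TOTAL_BITS	(SWP_PFN_BITS + SWP_MIG_AD_BITS)

int main(void)
{
	/* Pretend the arch swp pte offset field is 50 bits wide, so the
	 * cached maximum is 2^50 pages (an assumption, 64-bit build). */
	unsigned long swapfile_maximum_size = 1UL << 50;
	int supported;

	/* same comparison as the one added to swapfile_init() */
	supported = swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS);

	printf("PFN needs %d bits, migration entry needs %d bits\n",
	       SWP_PFN_BITS, SWP_MIG_TOTAL_BITS);
	printf("swap migration A/D bits %ssupported\n", supported ? "" : "not ");
	return 0;
}

With these assumed numbers a PFN needs 34 bits and a migration entry 36, comfortably below the 50-bit offset field, so the A/D bits would be retained across migration; an arch whose swp pte leaves fewer offset bits than SWP_MIG_TOTAL_BITS would simply fall back to dropping them.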