From 07f44ac3c90c50a201307d3fe4dda120ee8394f5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 16 May 2023 14:38:18 +0800 Subject: mm: page_alloc: move pm_* function into power pm_restrict_gfp_mask()/pm_restore_gfp_mask() only used in power, let's move them out of page_alloc.c. Adding a general gfp_has_io_fs() function which return true if gfp with both __GFP_IO and __GFP_FS flags, then use it inside of pm_suspended_storage(), also the pm_suspended_storage() is moved into suspend.h. Link: https://lkml.kernel.org/r/20230516063821.121844-11-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Iurii Zaikin Cc: Kees Cook Cc: Len Brown Cc: Luis Chamberlain Cc: Mike Rapoport (IBM) Cc: Oscar Salvador Cc: Pavel Machek Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- mm/swapfile.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 274bbf797480..c74259001d5e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include -- cgit From f9f956b550b8ce6fb902af9f29ba31b3e0fd052d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 29 May 2023 14:13:51 +0800 Subject: swap: remove get/put_swap_device() in __swap_count() Patch series "swap: cleanup get/put_swap_device() usage", v3. The general rule to use a swap entry is as follows. When we get a swap entry, if there aren't some other ways to prevent swapoff, such as the folio in swap cache is locked, page table lock is held, etc., the swap entry may become invalid because of swapoff. Then, we need to enclose all swap related functions with get_swap_device() and put_swap_device(), unless the swap functions call get/put_swap_device() by themselves. Based on the above rule, all get/put_swap_device() usage are checked and cleaned up if necessary. This patch (of 5): get/put_swap_device() are added to __swap_count() in commit eb085574a752 ("mm, swap: fix race between swapoff and some swap operations"). Later, in commit 2799e77529c2 ("swap: fix do_swap_page() race with swapoff"), get/put_swap_device() are added to do_swap_page(). And they enclose the only call site of __swap_count(). So, it's safe to remove get/put_swap_device() in __swap_count() now. Link: https://lkml.kernel.org/r/20230529061355.125791-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230529061355.125791-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Yosry Ahmed Reviewed-by: David Hildenbrand Reviewed-by: Chris Li (Google) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/swapfile.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index c74259001d5e..cf8b16b6a98e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1433,16 +1433,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si; + struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); - int count = 0; - si = get_swap_device(entry); - if (si) { - count = swap_count(si->swap_map[offset]); - put_swap_device(si); - } - return count; + return swap_count(si->swap_map[offset]); } /* -- cgit From 3ecdeb0f876e91c4a7129ba2ba5baa530aa6c4f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 29 May 2023 14:13:53 +0800 Subject: swap: remove __swp_swapcount() __swp_swapcount() just encloses the calling to swap_swapcount() with get/put_swap_device(). It is called in __read_swap_cache_async() only, which encloses the calling with get/put_swap_device() already. So, __read_swap_cache_async() can call swap_swapcount() directly. Link: https://lkml.kernel.org/r/20230529061355.125791-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: David Hildenbrand Reviewed-by: Chris Li (Google) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Yang Shi Cc: Yu Zhao Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swapfile.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index cf8b16b6a98e..2d264efe90d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1444,7 +1444,7 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; @@ -1456,24 +1456,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) return count; } -/* - * How many references to @entry are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -int __swp_swapcount(swp_entry_t entry) -{ - int count = 0; - struct swap_info_struct *si; - - si = get_swap_device(entry); - if (si) { - count = swap_swapcount(si, entry); - put_swap_device(si); - } - return count; -} - /* * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. -- cgit From c07aee4f82af3c466509782b15658837fe53babc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 29 May 2023 14:13:54 +0800 Subject: swap: remove get/put_swap_device() in __swap_duplicate() __swap_duplicate() is called by - swap_shmem_alloc(): the folio in swap cache is locked. - copy_nonpresent_pte() -> swap_duplicate() and try_to_unmap_one() -> swap_duplicate(): the page table lock is held. - __read_swap_cache_async() -> swapcache_prepare(): enclosed with get/put_swap_device() in __read_swap_cache_async() already. So, it's safe to remove get/put_swap_device() in __swap_duplicate(). Link: https://lkml.kernel.org/r/20230529061355.125791-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Yosry Ahmed Reviewed-by: David Hildenbrand Reviewed-by: Chris Li (Google) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2d264efe90d2..bd035677f196 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3265,9 +3265,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned char has_cache; int err; - p = get_swap_device(entry); - if (!p) - return -EINVAL; + p = swp_swap_info(entry); offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); @@ -3314,7 +3312,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); - put_swap_device(p); return err; } -- cgit From a95722a047724ef75567381976a36f0e44230bd9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 29 May 2023 14:13:55 +0800 Subject: swap: comments get_swap_device() with usage rule The general rule to use a swap entry is as follows. When we get a swap entry, if there aren't some other ways to prevent swapoff, such as the folio in swap cache is locked, page table lock is held, etc., the swap entry may become invalid because of swapoff. Then, we need to enclose all swap related functions with get_swap_device() and put_swap_device(), unless the swap functions call get/put_swap_device() by themselves. Add the rule as comments of get_swap_device(). Link: https://lkml.kernel.org/r/20230529061355.125791-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: David Hildenbrand Reviewed-by: Yosry Ahmed Reviewed-by: Chris Li (Google) Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/swapfile.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index bd035677f196..df312534e239 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1220,6 +1220,13 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, } /* + * When we get a swap entry, if there aren't some other ways to + * prevent swapoff, such as the folio in swap cache is locked, page + * table lock is held, etc., the swap entry may become invalid because + * of swapoff. Then, we need to enclose all swap related functions + * with get_swap_device() and put_swap_device(), unless the swap + * functions call get/put_swap_device() by themselves. + * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until @@ -1228,9 +1235,8 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way - * to prevent swapoff, such as page lock, page table lock, etc. The - * caller must be prepared for that. For example, the following - * situation is possible. + * to prevent swapoff. The caller must be prepared for that. For + * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() -- cgit From d850fa729873787a4030eed5fd875d00eb63946b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 8 Jun 2023 18:36:11 -0700 Subject: mm/swapoff: allow pte_offset_map[_lock]() to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjust unuse_pte() and unuse_pte_range() to allow pte_offset_map_lock() and pte_offset_map() failure; remove pmd_none_or_trans_huge_or_clear_bad() from unuse_pmd_range() now that pte_offset_map() does all that itself. Link: https://lkml.kernel.org/r/c4d831-13c3-9dfd-70c2-64514ad951fd@google.com Signed-off-by: Hugh Dickins Cc: Alistair Popple Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christophe Leroy Cc: Christoph Hellwig Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Ryan Roberts Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Signed-off-by: Andrew Morton --- mm/swapfile.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index df312534e239..74dd4d2337b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1757,7 +1757,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { ret = 0; goto out; } @@ -1810,7 +1810,8 @@ setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: - pte_unmap_unlock(pte, ptl); + if (pte) + pte_unmap_unlock(pte, ptl); if (page != swapcache) { unlock_page(page); put_page(page); @@ -1822,17 +1823,22 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - swp_entry_t entry; - pte_t *pte; + pte_t *pte = NULL; struct swap_info_struct *si; - int ret = 0; si = swap_info[type]; - pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; unsigned char swp_count; + swp_entry_t entry; + int ret; + + if (!pte++) { + pte = pte_offset_map(pmd, addr); + if (!pte) + break; + } if (!is_swap_pte(*pte)) continue; @@ -1843,6 +1849,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); + pte = NULL; + folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; @@ -1861,8 +1869,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) - goto try_next; - + continue; return -ENOMEM; } @@ -1872,20 +1879,17 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (ret < 0) { folio_unlock(folio); folio_put(folio); - goto out; + return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); -try_next: - pte = pte_offset_map(pmd, addr); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + } while (addr += PAGE_SIZE, addr != end); - ret = 0; -out: - return ret; + if (pte) + pte_unmap(pte); + return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -1900,8 +1904,6 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { cond_resched(); next = pmd_addr_end(addr, end); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; -- cgit From c33c794828f21217f72ce6fc140e0d34e0d56bff Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 12 Jun 2023 16:15:45 +0100 Subject: mm: ptep_get() conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all instances of direct pte_t* dereferencing to instead use ptep_get() helper. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. Conversion was done using Coccinelle: ---- // $ make coccicheck \ // COCCI=ptepget.cocci \ // SPFLAGS="--include-headers" \ // MODE=patch virtual patch @ depends on patch @ pte_t *v; @@ - *v + ptep_get(v) ---- Then reviewed and hand-edited to avoid multiple unnecessary calls to ptep_get(), instead opting to store the result of a single call in a variable, where it is correct to do so. This aims to negate any cost of READ_ONCE() and will benefit arch-overrides that may be more complex. Included is a fix for an issue in an earlier version of this patch that was pointed out by kernel test robot. The issue arose because config MMU=n elides definition of the ptep helper functions, including ptep_get(). HUGETLB_PAGE=n configs still define a simple huge_ptep_clear_flush() for linking purposes, which dereferences the ptep. So when both configs are disabled, this caused a build error because ptep_get() is not defined. Fix by continuing to do a direct dereference when MMU=n. This is safe because for this config the arch code cannot be trying to virtualize the ptes because none of the ptep helpers are defined. Link: https://lkml.kernel.org/r/20230612151545.3317766-4-ryan.roberts@arm.com Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202305120142.yXsNEo6H-lkp@intel.com/ Signed-off-by: Ryan Roberts Cc: Adrian Hunter Cc: Alexander Potapenko Cc: Alexander Shishkin Cc: Alex Williamson Cc: Al Viro Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christian Brauner Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dave Airlie Cc: Dimitri Sivanich Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Jason Gunthorpe Cc: Jérôme Glisse Cc: Jiri Olsa Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Namhyung Kim Cc: Naoya Horiguchi Cc: Oleksandr Tyshchenko Cc: Pavel Tatashin Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Uladzislau Rezki (Sony) Cc: Vincenzo Frascino Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/swapfile.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 74dd4d2337b7..a6945c2e0d03 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1757,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte || !pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1793,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1802,9 +1805,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); @@ -1833,6 +1836,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned char swp_count; swp_entry_t entry; int ret; + pte_t ptent; if (!pte++) { pte = pte_offset_map(pmd, addr); @@ -1840,10 +1844,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, break; } - if (!is_swap_pte(*pte)) + ptent = ptep_get_lockless(pte); + + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; -- cgit From 3fda49e89f1702df6bb6a2470076b1a7bf3a29de Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 19 Jun 2023 23:50:00 -0700 Subject: mm/swapfile: delete outdated pte_offset_map() comment Delete a triply out-of-date comment from add_swap_count_continuation(): 1. vmalloc_to_page() changed from pte_offset_map() to pte_offset_kernel() 2. pte_offset_map() changed from using kmap_atomic() to kmap_local_page() 3. kmap_atomic() changed from using fixed FIX_KMAP addresses in 2.6.37. Link: https://lkml.kernel.org/r/9022632b-ba9d-8cb0-c25-4be9786481b5@google.com Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index a6945c2e0d03..92ed7cba2268 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3456,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) goto out; } - /* - * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel page tables: so it - * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. - */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; -- cgit