From 7a8817f2c96e98d5e65a59e34ba9ea1ff6ed23bc Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Thu, 27 Jul 2023 19:56:43 +0800
Subject: mm: memory-failure: add PageOffline() check

Memory failure is not interested in logically offlined pages.  Skip this
type of page.

Link: https://lkml.kernel.org/r/20230727115643.639741-5-linmiaohe@huawei.com
Signed-off-by: Miaohe Lin
Acked-by: Naoya Horiguchi
Cc: Kefeng Wang
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/memory-failure.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 881c35ef1daa..ed29d720ee76 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1562,7 +1562,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * Here we are interested only in user-mapped pages, so skip any
 	 * other types of pages.
 	 */
-	if (PageReserved(p) || PageSlab(p) || PageTable(p))
+	if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p))
 		return true;
 	if (!(PageLRU(hpage) || PageHuge(p)))
 		return true;
@@ -2533,7 +2533,8 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
 
-	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+	if (folio_test_slab(folio) || PageTable(&folio->page) ||
+	    folio_test_reserved(folio) || PageOffline(&folio->page))
 		goto unlock_mutex;
 
 	/*
-- cgit
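For context, the memory_failure() path hardened above can be exercised from
userspace with madvise(MADV_HWPOISON), the usual way to inject a hw-poison
event for testing.  A minimal sketch, illustrative only and not part of the
patch: it assumes a kernel with CONFIG_MEMORY_FAILURE and a caller with
CAP_SYS_ADMIN, and it deliberately poisons one of its own pages.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psize = sysconf(_SC_PAGESIZE);
            /* Back the mapping with a real page before poisoning it. */
            char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            memset(p, 0xaa, psize);

            /* Invokes memory_failure() on the backing pfn. */
            if (madvise(p, psize, MADV_HWPOISON))
                    perror("madvise(MADV_HWPOISON)");
            else
                    puts("poisoned; touching the page now raises SIGBUS");
            return 0;
    }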
From 6885938c349c14b277305cd1129e7eb14f3e2c55 Mon Sep 17 00:00:00 2001
From: Jiaqi Yan
Date: Thu, 13 Jul 2023 23:55:53 +0000
Subject: mm/hwpoison: rename hwp_walk* to hwpoison_walk*

In the discussion of "Improve hugetlbfs read on HWPOISON hugepages" [1],
Matthew Wilcox suggests hwp is a bad abbreviation of hwpoison, as hwp is
already used as "an acronym by acpi, intel_pstate, some clock drivers, an
ethernet driver, and a scsi driver" [1].

So rename hwp_walk and hwp_walk_ops to hwpoison_walk and
hwpoison_walk_ops respectively.

raw_hwp_(page|list), *_raw_hwp, and the raw_hwp_unreliable flag are other
major appearances of "hwp".  However, given the "raw" hint in the name,
they are easy to differentiate from other "hwp" acronyms.  Since renaming
them is not as straightforward as renaming hwp_walk*, they are not covered
by this commit.

[1] https://lore.kernel.org/lkml/20230707201904.953262-5-jiaqiyan@google.com/T/#me6fecb8ce1ad4d5769199c9e162a44bc88f7bdec

Link: https://lkml.kernel.org/r/20230713235553.4121855-1-jiaqiyan@google.com
Signed-off-by: Jiaqi Yan
Reviewed-by: Anshuman Khandual
Acked-by: Naoya Horiguchi
Cc: Matthew Wilcox (Oracle)
Cc: Miaohe Lin
Signed-off-by: Andrew Morton
---
 mm/memory-failure.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ed29d720ee76..7b01fffe7a79 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -717,7 +717,7 @@ static void collect_procs(struct page *page, struct list_head *tokill,
 		collect_procs_file(page, tokill, force_early);
 }
 
-struct hwp_walk {
+struct hwpoison_walk {
 	struct to_kill tk;
 	unsigned long pfn;
 	int flags;
@@ -752,7 +752,7 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
-				      struct hwp_walk *hwp)
+				      struct hwpoison_walk *hwp)
 {
 	pmd_t pmd = *pmdp;
 	unsigned long pfn;
@@ -770,7 +770,7 @@ static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 }
 #else
 static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
-				      struct hwp_walk *hwp)
+				      struct hwpoison_walk *hwp)
 {
 	return 0;
 }
@@ -779,7 +779,7 @@ static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
 static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
 			      unsigned long end, struct mm_walk *walk)
 {
-	struct hwp_walk *hwp = walk->private;
+	struct hwpoison_walk *hwp = walk->private;
 	int ret = 0;
 	pte_t *ptep, *mapped_pte;
 	spinlock_t *ptl;
@@ -813,7 +813,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 			    unsigned long addr, unsigned long end,
 			    struct mm_walk *walk)
 {
-	struct hwp_walk *hwp = walk->private;
+	struct hwpoison_walk *hwp = walk->private;
 	pte_t pte = huge_ptep_get(ptep);
 	struct hstate *h = hstate_vma(walk->vma);
 
@@ -824,7 +824,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 #define hwpoison_hugetlb_range	NULL
 #endif
 
-static const struct mm_walk_ops hwp_walk_ops = {
+static const struct mm_walk_ops hwpoison_walk_ops = {
 	.pmd_entry = hwpoison_pte_range,
 	.hugetlb_entry = hwpoison_hugetlb_range,
 	.walk_lock = PGWALK_RDLOCK,
@@ -847,7 +847,7 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 				  int flags)
 {
 	int ret;
-	struct hwp_walk priv = {
+	struct hwpoison_walk priv = {
 		.pfn = pfn,
 	};
 	priv.tk.tsk = p;
@@ -856,7 +856,7 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 		return -EFAULT;
 
 	mmap_read_lock(p->mm);
-	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwpoison_walk_ops,
 			      (void *)&priv);
 	if (ret == 1 && priv.tk.addr)
 		kill_proc(&priv.tk, pfn, flags);
-- cgit
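For readers who don't have the pagewalk API in their head: hwpoison_walk_ops
plugs into walk_page_range(), which invokes the registered callbacks for each
entry in the range.  Below is a minimal kernel-side sketch of that pattern
under the same locking rules; it is not from the patch, and count_pte(),
count_walk_ops and count_present() are invented names for illustration.

    /* Sketch: count present PTEs in an address space via the pagewalk API. */
    static int count_pte(pte_t *ptep, unsigned long addr, unsigned long next,
                         struct mm_walk *walk)
    {
            unsigned long *count = walk->private;

            if (pte_present(ptep_get(ptep)))
                    (*count)++;
            return 0;       /* a nonzero return would stop the walk */
    }

    static const struct mm_walk_ops count_walk_ops = {
            .pte_entry = count_pte,
            .walk_lock = PGWALK_RDLOCK,
    };

    /* Caller must hold mmap_read_lock(mm), as kill_accessing_process() does. */
    static unsigned long count_present(struct mm_struct *mm)
    {
            unsigned long count = 0;

            walk_page_range(mm, 0, TASK_SIZE, &count_walk_ops, &count);
            return count;
    }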
From 8b47933544a68a62a9c4e35f8d8a6d2a2c935823 Mon Sep 17 00:00:00 2001
From: Stefan Roesch
Date: Tue, 22 Aug 2023 11:05:39 -0700
Subject: proc/ksm: add ksm stats to /proc/pid/smaps

With madvise and prctl, KSM can be enabled for different VMAs.  Once it
is enabled, we can query how effective KSM is overall.  However, we
cannot easily query whether an individual VMA benefits from KSM.

This commit adds a KSM section to the /proc/<pid>/smaps file.  It reports
how many of the pages are KSM pages.  Note that KSM-placed zeropages are
not included, only actual KSM pages.

Here is a typical output:

    7f420a000000-7f421a000000 rw-p 00000000 00:00 0
    Size:             262144 kB
    KernelPageSize:        4 kB
    MMUPageSize:           4 kB
    Rss:               51212 kB
    Pss:                8276 kB
    Shared_Clean:        172 kB
    Shared_Dirty:      42996 kB
    Private_Clean:       196 kB
    Private_Dirty:      7848 kB
    Referenced:        15388 kB
    Anonymous:         51212 kB
    KSM:               41376 kB
    LazyFree:              0 kB
    AnonHugePages:         0 kB
    ShmemPmdMapped:        0 kB
    FilePmdMapped:         0 kB
    Shared_Hugetlb:        0 kB
    Private_Hugetlb:       0 kB
    Swap:             202016 kB
    SwapPss:            3882 kB
    Locked:                0 kB
    THPeligible:           0
    ProtectionKey:         0
    ksm_state:             0
    ksm_skip_base:         0
    ksm_skip_count:        0
    VmFlags: rd wr mr mw me nr mg anon

This information also helps with the following workflow:

- First enable KSM for all the VMAs of a process with prctl.
- Then analyze with the above smaps report which VMAs benefit the most.
- Change the application (if possible) to add the corresponding madvise
  calls for the VMAs that benefit the most.

[shr@devkernel.io: v5]
  Link: https://lkml.kernel.org/r/20230823170107.1457915-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230822180539.1424843-1-shr@devkernel.io
Signed-off-by: Stefan Roesch
Reviewed-by: David Hildenbrand
Cc: Johannes Weiner
Cc: Rik van Riel
Signed-off-by: Andrew Morton
---
 Documentation/filesystems/proc.rst | 4 ++++
 fs/proc/task_mmu.c                 | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 7897a7dafcbc..d49af173b5af 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -461,6 +461,7 @@ Memory Area, or VMA) there is a series of lines such as the following::
     Private_Dirty:         0 kB
     Referenced:          892 kB
     Anonymous:             0 kB
+    KSM:                   0 kB
     LazyFree:              0 kB
     AnonHugePages:         0 kB
     ShmemPmdMapped:        0 kB
@@ -501,6 +502,9 @@ accessed.
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
 
+"KSM" reports how many of the pages are KSM pages. Note that KSM-placed zeropages
+are not included, only actual KSM pages.
+
 "LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
 The memory isn't freed immediately with madvise(). It's freed in memory
 pressure if the memory is clean. Please note that the printed value might
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 15ddf4653a19..5fc732cb2350 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
+#include <linux/ksm.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
@@ -396,6 +397,7 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
+	unsigned long ksm;
 	u64 pss;
 	u64 pss_anon;
 	u64 pss_file;
@@ -452,6 +454,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 			mss->lazyfree += size;
 	}
 
+	if (PageKsm(page))
+		mss->ksm += size;
+
 	mss->resident += size;
 	/* Accumulate the size in pages that have been accessed. */
 	if (young || page_is_young(page) || PageReferenced(page))
@@ -822,6 +827,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
 	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
 	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
 	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
+	SEQ_PUT_DEC(" kB\nKSM:            ", mss->ksm);
 	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
 	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
-- cgit
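As a hedged illustration of the workflow described in the log above (not part
of the patch): the sketch below marks a mapping mergeable, gives ksmd time to
scan, and then reads back the new KSM field from its own smaps.  It assumes
KSM is compiled in and running (e.g. echo 1 > /sys/kernel/mm/ksm/run); the
prctl(PR_SET_MEMORY_MERGE, ...) mentioned in the log is the process-wide
alternative to the per-range madvise.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 256 * 4096;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            FILE *f;
            char line[256];

            if (p == MAP_FAILED)
                    return 1;
            memset(p, 0x5a, len);   /* identical pages: ideal KSM fodder */
            if (madvise(p, len, MADV_MERGEABLE))
                    perror("madvise(MADV_MERGEABLE)");

            sleep(10);              /* crude: let ksmd scan the range */

            /* Print the KSM: line this patch adds to smaps. */
            f = fopen("/proc/self/smaps", "r");
            while (f && fgets(line, sizeof(line), f))
                    if (!strncmp(line, "KSM:", 4))
                            fputs(line, stdout);
            if (f)
                    fclose(f);
            return 0;
    }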
From 6ad11bc6ed37c6371e9e13619862709b6529b43a Mon Sep 17 00:00:00 2001
From: Baruch Siach
Date: Thu, 24 Aug 2023 14:38:08 +0300
Subject: rmap: remove anon_vma_link() nommu stub

anon_vma_link() is unused since commit 5beb49305251 ("mm: change
anon_vma linking to fix multi-process server scalability issue").

Link: https://lkml.kernel.org/r/cdce9b00c9ab15f6d02eddf40dcad537d3e9676f.1692877089.git.baruch@tkos.co.il
Signed-off-by: Baruch Siach
Reviewed-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 include/linux/rmap.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index a3825ce81102..51cc21ebb568 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -479,7 +479,6 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
 
 #define anon_vma_init()		do {} while (0)
 #define anon_vma_prepare(vma)	(0)
-#define anon_vma_link(vma)	do {} while (0)
 
 static inline int folio_referenced(struct folio *folio, int is_locked,
 				  struct mem_cgroup *memcg,
-- cgit
From 12af80f6c9f2daf07bc3125605dc2e454db321a5 Mon Sep 17 00:00:00 2001
From: Baruch Siach
Date: Thu, 24 Aug 2023 14:38:09 +0300
Subject: MAINTAINERS: add rmap.h to mm entry

Make it easier to figure out where to send patches for this file.

Link: https://lkml.kernel.org/r/efbc7689d35a48ff402644d696aa9a8d8bb6333a.1692877089.git.baruch@tkos.co.il
Signed-off-by: Baruch Siach
Reviewed-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6849f4994787..70f773562053 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13622,6 +13622,7 @@ F:	include/linux/memory_hotplug.h
 F:	include/linux/mm.h
 F:	include/linux/mmzone.h
 F:	include/linux/pagewalk.h
+F:	include/linux/rmap.h
 F:	include/trace/events/ksm.h
 F:	mm/
 F:	tools/mm/
-- cgit
From f945116e4e191cd543ecd56d9f13e6331494847c Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Thu, 24 Aug 2023 11:38:21 -0400
Subject: mm: page_alloc: remove stale CMA guard code

In the past, movable allocations could be disallowed from CMA through
PF_MEMALLOC_PIN.  As CMA pages are funneled through the MOVABLE pcplist,
this required filtering that cornercase during allocations, such that
pinnable allocations wouldn't accidentally get a CMA page.

However, since 8e3560d963d2 ("mm: honor PF_MEMALLOC_PIN for all movable
pages"), PF_MEMALLOC_PIN automatically excludes __GFP_MOVABLE.  Once
again, MOVABLE implies CMA is allowed.

Remove the stale filtering code.  Also remove a stale comment that was
introduced as part of the filtering code, because the filtering let
order-0 pages fall through to the buddy allocator.  See 1d91df85f399
("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore}
APIs") for context.  The comment's been obsolete since the introduction
of the explicit ALLOC_HIGHATOMIC flag in eb2e2b425c69 ("mm/page_alloc:
explicitly record high-order atomic allocations in alloc_flags").

Link: https://lkml.kernel.org/r/20230824153821.243148-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Joonsoo Kim
Cc: Miaohe Lin
Cc: Pasha Tatashin
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452459836b71..0c5be12f9336 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2641,12 +2641,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 	do {
 		page = NULL;
 		spin_lock_irqsave(&zone->lock, flags);
-		/*
-		 * order-0 request can reach here when the pcplist is skipped
-		 * due to non-CMA allocation context. HIGHATOMIC area is
-		 * reserved for high-order atomic allocation, so order-0
-		 * request should skip it.
-		 */
 		if (alloc_flags & ALLOC_HIGHATOMIC)
 			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
 		if (!page) {
@@ -2780,17 +2774,10 @@ struct page *rmqueue(struct zone *preferred_zone,
 	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 
 	if (likely(pcp_allowed_order(order))) {
-		/*
-		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
-		 * we need to skip it when CMA area isn't allowed.
-		 */
-		if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
-				migratetype != MIGRATE_MOVABLE) {
-			page = rmqueue_pcplist(preferred_zone, zone, order,
-					migratetype, alloc_flags);
-			if (likely(page))
-				goto out;
-		}
+		page = rmqueue_pcplist(preferred_zone, zone, order,
+				       migratetype, alloc_flags);
+		if (likely(page))
+			goto out;
 	}
 
 	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
-- cgit
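The premise of this patch, that PF_MEMALLOC_PIN strips __GFP_MOVABLE before
the allocator ever sees the flags, lives in current_gfp_context() in
include/linux/sched/mm.h.  A simplified sketch of that logic follows; the
real helper also handles PF_MEMALLOC_NOIO and PF_MEMALLOC_NOFS, so treat
this as a reading aid rather than the actual implementation.

    /* Simplified: a pinned context loses __GFP_MOVABLE, and with it CMA. */
    static inline gfp_t current_gfp_context(gfp_t flags)
    {
            unsigned int pflags = READ_ONCE(current->flags);

            if (pflags & PF_MEMALLOC_PIN)
                    flags &= ~__GFP_MOVABLE;  /* no CMA, no ZONE_MOVABLE */
            return flags;
    }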
From e68d343d2720779362cb7160cb7f4bd24979b2b4 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Fri, 25 Aug 2023 12:49:47 -0400
Subject: mm/kmemleak: move up cond_resched() call in page scanning loop

Commit bde5f6bc68db ("kmemleak: add scheduling point to kmemleak_scan()")
added a cond_resched() call to the struct page scanning loop to prevent
soft lockup from happening.  However, soft lockup can still happen in
that loop in some corner cases when the pages that satisfy the
"!(pfn & 63)" check are skipped for some reason.

Fix this corner case by moving up the cond_resched() check so that it
will be called every 64 pages unconditionally.

Link: https://lkml.kernel.org/r/20230825164947.1317981-1-longman@redhat.com
Fixes: bde5f6bc68db ("kmemleak: add scheduling point to kmemleak_scan()")
Signed-off-by: Waiman Long
Cc: Catalin Marinas
Cc: Yisheng Xie
Signed-off-by: Andrew Morton
---
 mm/kmemleak.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2918150e31bd..54c2c90d3abc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1584,6 +1584,9 @@ static void kmemleak_scan(void)
 		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 			struct page *page = pfn_to_online_page(pfn);
 
+			if (!(pfn & 63))
+				cond_resched();
+
 			if (!page)
 				continue;
 
@@ -1594,8 +1597,6 @@ static void kmemleak_scan(void)
 			if (page_count(page) == 0)
 				continue;
 			scan_block(page, page + 1, NULL);
-			if (!(pfn & 63))
-				cond_resched();
 		}
 	}
 	put_online_mems();
-- cgit
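To spell the corner case out: with the old placement, every continue ahead
of the trailing check skipped it, so a long run of filtered-out pfns (for
example, offline pages) never rescheduled.  A condensed before/after sketch
of the loop, with the unrelated filters elided:

    /* Old: only pages that survive every filter reach the check. */
    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
            struct page *page = pfn_to_online_page(pfn);

            if (!page)
                    continue;       /* jumps past the cond_resched() below */
            scan_block(page, page + 1, NULL);
            if (!(pfn & 63))
                    cond_resched();
    }

    /* New: the check runs first, firing every 64 pfns unconditionally. */
    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
            struct page *page = pfn_to_online_page(pfn);

            if (!(pfn & 63))
                    cond_resched();
            if (!page)
                    continue;
            scan_block(page, page + 1, NULL);
    }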