From bbefa0fc04bab21e85f6b2ee7984c59694366f6a Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:36 +0800
Subject: mm/compaction: use correct list in move_freelist_{head}/{tail}

Patch series "Fixes and cleanups to compaction", v3.

This is a series of fixes and cleanups to compaction.
Patches 1-2 fix and clean up the freepage list operations.
Patches 3-4 fix and clean up the isolation of freepages.
Patch 7 factors out the code that checks whether compaction is needed
for a given allocation order.
More details can be found in the respective patches.

This patch (of 6):

The freepage is chained via buddy_list in the freelist. Use buddy_list
instead of lru to correct the list operations.

Link: https://lkml.kernel.org/r/20230901155141.249860-1-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20230901155141.249860-2-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 38c8d216c6a3..e3ee1bc1c0ad 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1395,8 +1395,8 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
 {
 	LIST_HEAD(sublist);
 
-	if (!list_is_last(freelist, &freepage->lru)) {
-		list_cut_before(&sublist, freelist, &freepage->lru);
+	if (!list_is_last(freelist, &freepage->buddy_list)) {
+		list_cut_before(&sublist, freelist, &freepage->buddy_list);
 		list_splice_tail(&sublist, freelist);
 	}
 }
@@ -1412,8 +1412,8 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
 {
 	LIST_HEAD(sublist);
 
-	if (!list_is_first(freelist, &freepage->lru)) {
-		list_cut_position(&sublist, freelist, &freepage->lru);
+	if (!list_is_first(freelist, &freepage->buddy_list)) {
+		list_cut_position(&sublist, freelist, &freepage->buddy_list);
 		list_splice_tail(&sublist, freelist);
 	}
 }
--
cgit

From 4c17989116cb0a6a91f4184077c342a9097b748e Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:37 +0800
Subject: mm/compaction: call list_is_{first}/{last} more intuitively in
 move_freelist_{head}/{tail}

We use move_freelist_head() after list_for_each_entry_reverse() to skip
recent pages, and there is no need to do an actual move if all
freepages were already searched in list_for_each_entry_reverse(), i.e.
when freepage points to the first page in the freelist. It is more
intuitive to call list_is_first() with the list entry as the first
argument and the list head as the second argument to check whether the
entry is the first one, instead of calling list_is_last() with the
entry and head passed in reverse. Similarly, calling list_is_last() in
move_freelist_tail() reads more intuitively.
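For reference (illustrative snippet only, not part of the patch): the
list.h helpers take the entry as the first argument and the head as the
second, so the guards now read naturally:

	/* "is freepage the first/last entry of freelist?" */
	if (!list_is_first(&freepage->buddy_list, freelist))
		/* rotate: some entries precede freepage */;
	if (!list_is_last(&freepage->buddy_list, freelist))
		/* rotate: some entries follow freepage */;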
Link: https://lkml.kernel.org/r/20230901155141.249860-3-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index e3ee1bc1c0ad..a40550a33aee 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1395,7 +1395,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
 {
 	LIST_HEAD(sublist);
 
-	if (!list_is_last(freelist, &freepage->buddy_list)) {
+	if (!list_is_first(&freepage->buddy_list, freelist)) {
 		list_cut_before(&sublist, freelist, &freepage->buddy_list);
 		list_splice_tail(&sublist, freelist);
 	}
@@ -1412,7 +1412,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
 {
 	LIST_HEAD(sublist);
 
-	if (!list_is_first(freelist, &freepage->buddy_list)) {
+	if (!list_is_last(&freepage->buddy_list, freelist)) {
 		list_cut_position(&sublist, freelist, &freepage->buddy_list);
 		list_splice_tail(&sublist, freelist);
 	}
--
cgit

From 3da0272a4c7d0d37b47b28e87014f421296fc2be Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:38 +0800
Subject: mm/compaction: correctly return failure with bogus compound_order in
 strict mode

In strict mode, we should return 0 if there is any hole in the
pageblock. If we successfully isolate pages at the beginning of the
pageblock and then hit a bogus compound_order outside the pageblock on
the next page, we abort the search loop with blockpfn > end_pfn.
Although blockpfn is then limited to end_pfn, strict mode still treats
this as a successful isolation, because blockpfn is not < end_pfn, and
returns the partially isolated pages. isolate_freepages_range() may
then succeed unexpectedly with a hole in the isolated range.

Link: https://lkml.kernel.org/r/20230901155141.249860-4-shikemeng@huaweicloud.com
Fixes: 9fcd6d2e052e ("mm, compaction: skip compound pages by order in free scanner")
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index a40550a33aee..9ecbfbc695e5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -626,11 +626,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 		if (PageCompound(page)) {
 			const unsigned int order = compound_order(page);
 
-			if (likely(order <= MAX_ORDER)) {
+			if (blockpfn + (1UL << order) <= end_pfn) {
 				blockpfn += (1UL << order) - 1;
 				page += (1UL << order) - 1;
 				nr_scanned += (1UL << order) - 1;
 			}
+
 			goto isolate_fail;
 		}
 
@@ -678,8 +679,7 @@ isolate_fail:
 		spin_unlock_irqrestore(&cc->zone->lock, flags);
 
 	/*
-	 * There is a tiny chance that we have read bogus compound_order(),
-	 * so be careful to not go outside of the pageblock.
+	 * Be careful to not go outside of the pageblock.
 	 */
 	if (unlikely(blockpfn > end_pfn))
 		blockpfn = end_pfn;
--
cgit

From 8df4e28c64188911fba33789bf2cb882b3ae524e Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:39 +0800
Subject: mm/compaction: remove repeat compact_blockskip_flush check in
 reset_isolation_suitable

__reset_isolation_suitable() already checks compact_blockskip_flush, so
remove the repeated check before calling __reset_isolation_suitable()
from reset_isolation_suitable().
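After the change, the precondition lives only in the callee (simplified
sketch, not the exact kernel source):

	static void __reset_isolation_suitable(struct zone *zone)
	{
		/* Only flush if a full compaction finished recently */
		if (!zone->compact_blockskip_flush)
			return;
		/* ... clear pageblock skip information ... */
	}

so every caller, like reset_isolation_suitable(), can invoke it
unconditionally.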
Link: https://lkml.kernel.org/r/20230901155141.249860-5-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 9ecbfbc695e5..c377d78e0f15 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -382,6 +382,7 @@ static void __reset_isolation_suitable(struct zone *zone)
 	bool source_set = false;
 	bool free_set = false;
 
+	/* Only flush if a full compaction finished recently */
 	if (!zone->compact_blockskip_flush)
 		return;
 
@@ -434,9 +435,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
 		if (!populated_zone(zone))
 			continue;
 
-		/* Only flush if a full compaction finished recently */
-		if (zone->compact_blockskip_flush)
-			__reset_isolation_suitable(zone);
+		__reset_isolation_suitable(zone);
 	}
 }
--
cgit

From 9cc17ede5125933ab47f8f359c2cce3aca8ee757 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:40 +0800
Subject: mm/compaction: improve comment of is_via_compact_memory

We do proactive compaction with order == -1 via
1. /proc/sys/vm/compact_memory
2. /sys/devices/system/node/nodex/compact
3. /proc/sys/vm/compaction_proactiveness
Add the missed situations in which order == -1 to the comment.

Link: https://lkml.kernel.org/r/20230901155141.249860-6-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Acked-by: Mel Gorman
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index c377d78e0f15..ff3426a0d9c5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2065,8 +2065,10 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 }
 
 /*
- * order == -1 is expected when compacting via
- * /proc/sys/vm/compact_memory
+ * order == -1 is expected when compacting proactively via
+ * 1. /proc/sys/vm/compact_memory
+ * 2. /sys/devices/system/node/nodex/compact
+ * 3. /proc/sys/vm/compaction_proactiveness
  */
 static inline bool is_via_compact_memory(int order)
 {
--
cgit

From e19a3f595ae47bd8c034b98eb0b28a3877413387 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 1 Sep 2023 23:51:41 +0800
Subject: mm/compaction: factor out code to test if we should run compaction
 for target order

We always do the zone_watermark_ok() check and the
compaction_suitable() check together to test whether compaction for the
target order should be run. Factor this code out to remove the
repetition.

Link: https://lkml.kernel.org/r/20230901155141.249860-7-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi
Reviewed-by: Baolin Wang
Cc: David Hildenbrand
Cc: Matthew Wilcox (Oracle)
Cc: Mel Gorman
Signed-off-by: Andrew Morton
---
 mm/compaction.c | 66 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ff3426a0d9c5..01ba298739dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2378,6 +2378,30 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
 	return false;
 }
 
+/*
+ * Should we do compaction for target allocation order.
+ * Return COMPACT_SUCCESS if allocation for target order can be already + * satisfied + * Return COMPACT_SKIPPED if compaction for target order is likely to fail + * Return COMPACT_CONTINUE if compaction for target order should be ran + */ +static enum compact_result +compaction_suit_allocation_order(struct zone *zone, unsigned int order, + int highest_zoneidx, unsigned int alloc_flags) +{ + unsigned long watermark; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, + alloc_flags)) + return COMPACT_SUCCESS; + + if (!compaction_suitable(zone, order, highest_zoneidx)) + return COMPACT_SKIPPED; + + return COMPACT_CONTINUE; +} + static enum compact_result compact_zone(struct compact_control *cc, struct capture_control *capc) { @@ -2403,19 +2427,11 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->migratetype = gfp_migratetype(cc->gfp_mask); if (!is_via_compact_memory(cc->order)) { - unsigned long watermark; - - /* Allocation can already succeed, nothing to do */ - watermark = wmark_pages(cc->zone, - cc->alloc_flags & ALLOC_WMARK_MASK); - if (zone_watermark_ok(cc->zone, cc->order, watermark, - cc->highest_zoneidx, cc->alloc_flags)) - return COMPACT_SUCCESS; - - /* Compaction is likely to fail */ - if (!compaction_suitable(cc->zone, cc->order, - cc->highest_zoneidx)) - return COMPACT_SKIPPED; + ret = compaction_suit_allocation_order(cc->zone, cc->order, + cc->highest_zoneidx, + cc->alloc_flags); + if (ret != COMPACT_CONTINUE) + return ret; } /* @@ -2914,6 +2930,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) int zoneid; struct zone *zone; enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; + enum compact_result ret; for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; @@ -2921,14 +2938,10 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - /* Allocation can already succeed, check other zones */ - if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, - min_wmark_pages(zone), - highest_zoneidx, 0)) - continue; - - if (compaction_suitable(zone, pgdat->kcompactd_max_order, - highest_zoneidx)) + ret = compaction_suit_allocation_order(zone, + pgdat->kcompactd_max_order, + highest_zoneidx, ALLOC_WMARK_MIN); + if (ret == COMPACT_CONTINUE) return true; } @@ -2951,6 +2964,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; + enum compact_result ret; + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); @@ -2965,12 +2980,9 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - /* Allocation can already succeed, nothing to do */ - if (zone_watermark_ok(zone, cc.order, - min_wmark_pages(zone), zoneid, 0)) - continue; - - if (!compaction_suitable(zone, cc.order, zoneid)) + ret = compaction_suit_allocation_order(zone, + cc.order, zoneid, ALLOC_WMARK_MIN); + if (ret != COMPACT_CONTINUE) continue; if (kthread_should_stop()) -- cgit From 954652b9f33bb1892ea4448479d78779e4a8ae13 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 29 Aug 2023 17:45:49 -0700 Subject: mm/mremap: fix unaccount of memory on vma_merge() failure Fix mremap so that only accounted memory is unaccounted if the mapping is expandable but vma_merge() fails. 
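The underlying pattern -- record exactly what was charged and undo only
that -- can be sketched as follows (illustrative fragment; `charged`
mirrors the variable introduced by the diff below, and
`expansion_failed` is a hypothetical stand-in condition):

	long charged = 0;

	if (vma->vm_flags & VM_ACCOUNT) {
		if (security_vm_enough_memory_mm(mm, pages))
			return -ENOMEM;
		charged = pages;		/* remember what we charged */
	}

	/* ... attempt to expand the mapping ... */

	if (expansion_failed)
		vm_unacct_memory(charged);	/* never unaccount more than charged */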
Link: https://lkml.kernel.org/r/20230830004549.16131-1-anthony.yznaga@oracle.com
Fixes: fdbef6149135 ("mm/mremap: don't account pages in vma_to_resize()")
Signed-off-by: Anthony Yznaga
Acked-by: Brian Geffon
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 382e81c33fc4..fbb4861964f6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1037,12 +1037,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 			pgoff_t extension_pgoff = vma->vm_pgoff +
 				((extension_start - vma->vm_start) >> PAGE_SHIFT);
 			VMA_ITERATOR(vmi, mm, extension_start);
+			long charged = 0;
 
 			if (vma->vm_flags & VM_ACCOUNT) {
 				if (security_vm_enough_memory_mm(mm, pages)) {
 					ret = -ENOMEM;
 					goto out;
 				}
+				charged = pages;
 			}
 
 			/*
@@ -1058,7 +1060,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 				vma->vm_file, extension_pgoff, vma_policy(vma),
 				vma->vm_userfaultfd_ctx, anon_vma_name(vma));
 			if (!vma) {
-				vm_unacct_memory(pages);
+				vm_unacct_memory(charged);
 				ret = -ENOMEM;
 				goto out;
 			}
--
cgit

From dd34d9fe3b427dea47f81443d022dbaec4339f7e Mon Sep 17 00:00:00 2001
From: Anthony Yznaga
Date: Tue, 29 Aug 2023 17:43:24 -0700
Subject: mm: fix unaccount of memory on vma_link() failure

Fix insert_vm_struct() so that only accounted memory is unaccounted if
vma_link() fails.

Link: https://lkml.kernel.org/r/20230830004324.16101-1-anthony.yznaga@oracle.com
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Signed-off-by: Anthony Yznaga
Reviewed-by: Liam R. Howlett
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index b56a7f0c9f85..1ec6c875a7f9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3278,7 +3278,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 	}
 
 	if (vma_link(mm, vma)) {
-		vm_unacct_memory(charged);
+		if (vma->vm_flags & VM_ACCOUNT)
+			vm_unacct_memory(charged);
 		return -ENOMEM;
 	}
--
cgit

From d8f5f7e445f02eb10dee1a0a992146314cf460f8 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Tue, 29 Aug 2023 14:37:34 -0700
Subject: hugetlb: set hugetlb page flag before optimizing vmemmap

Currently, vmemmap optimization of hugetlb pages is performed before
the hugetlb flag (previously the hugetlb destructor) is set,
identifying the folio as a hugetlb folio. This means there is a window
of time during which an ordinary folio does not have all of its
associated vmemmap present. The core mm only expects vmemmap to be
potentially optimized for hugetlb and device dax, so this can cause
problems in code, such as memory error handling, that may want to write
to tail struct pages.

There is only one call that performs hugetlb vmemmap optimization
today. To fix this issue, simply set the hugetlb flag before that call.

There was a similar issue in the hugetlb free path that was previously
addressed. The two routines that optimize or restore hugetlb vmemmap
should only be passed hugetlb folios/pages. To catch any callers not
following this rule, add VM_WARN_ON calls to the routines. In the
hugetlb free code paths, some calls could be made to restore vmemmap
after clearing the hugetlb flag. This was 'safe' since in these cases
the vmemmap was already present and the call was a NOOP. However, for
consistency these calls were eliminated so that the VM_WARN_ON checks
can be added.
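The fix boils down to ordering two existing calls (sketch of the
pattern, mirroring the diff below):

	folio_set_hugetlb(folio);			/* identify as hugetlb first */
	hugetlb_vmemmap_optimize(h, &folio->page);	/* only then drop tail vmemmap */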
Link: https://lkml.kernel.org/r/20230829213734.69673-1-mike.kravetz@oracle.com Fixes: f41f2ed43ca5 ("mm: hugetlb: free the vmemmap pages associated with each HugeTLB page") Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: James Houghton Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 31 ++++++++++++++++++++++--------- mm/hugetlb_vmemmap.c | 3 +++ 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 52d26072dfda..afce9037f9ee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1720,7 +1720,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, &folio->page)) { + /* + * If folio is not vmemmap optimized (!clear_dtor), then the folio + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * can only be passed hugetlb pages and will BUG otherwise. + */ + if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1930,9 +1935,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { + folio_set_hugetlb(folio); hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -3580,13 +3585,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; + /* + * If vmemmap already existed for folio, the remove routine above would + * have cleared the hugetlb folio flag. Hence the folio is technically + * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * passed hugetlb folios and will BUG otherwise. 
+	 */
+	if (folio_test_hugetlb(folio)) {
+		rc = hugetlb_vmemmap_restore(h, &folio->page);
+		if (rc) {
+			/* Allocation of vmemmmap failed, we can not demote folio */
+			spin_lock_irq(&hugetlb_lock);
+			folio_ref_unfreeze(folio, 1);
+			add_hugetlb_folio(h, folio, false);
+			return rc;
+		}
 	}
 
 	/*
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 4b9734777f69..aeb7dd889eee 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "hugetlb_vmemmap.h"
@@ -456,6 +457,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 	unsigned long vmemmap_reuse;
 
+	VM_WARN_ON_ONCE(!PageHuge(head));
 	if (!HPageVmemmapOptimized(head))
 		return 0;
 
@@ -550,6 +552,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
 	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
 	unsigned long vmemmap_reuse;
 
+	VM_WARN_ON_ONCE(!PageHuge(head));
 	if (!vmemmap_should_optimize(h, head))
 		return;
--
cgit

From b72b3c9c34c825c81d205241c5f822fc7835923f Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Tue, 29 Aug 2023 11:33:43 +0800
Subject: mm/hugetlb: fix nodes huge page allocation when there are surplus
 pages

In set_max_huge_pages(), the local variable "count" is used to record
persistent_huge_pages(), but when it comes to per-node huge page
allocation, the semantics changes to nr_huge_pages. When there are
surplus huge pages and the interface under
/sys/devices/system/node/node*/hugepages is used to change the huge
page pool size, this difference can result in the allocation of an
unexpected number of huge pages.

Steps to reproduce the bug:

Starting with:

                  Node 0    Node 1    Total
 HugePages_Total    0.00      0.00     0.00
 HugePages_Free     0.00      0.00     0.00
 HugePages_Surp     0.00      0.00     0.00

Create 100 huge pages in Node 0 and consume them, then set Node 0's
nr_hugepages to 0. This yields:

                  Node 0    Node 1    Total
 HugePages_Total  200.00      0.00   200.00
 HugePages_Free     0.00      0.00     0.00
 HugePages_Surp   200.00      0.00   200.00

Write 100 to Node 1's nr_hugepages:

 echo 100 > /sys/devices/system/node/node1/\
 hugepages/hugepages-2048kB/nr_hugepages

which gives:

                  Node 0    Node 1    Total
 HugePages_Total  200.00    400.00   600.00
 HugePages_Free     0.00    400.00   400.00
 HugePages_Surp   200.00      0.00   200.00

The kernel is expected to create only 100 huge pages, but it gives 200.

Link: https://lkml.kernel.org/r/20230829033343.467779-1-xueshi.hu@smartx.com
Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes")
Signed-off-by: Xueshi Hu
Reviewed-by: Mike Kravetz
Cc: Andi Kleen
Cc: Lee Schermerhorn
Cc: Mel Gorman
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index afce9037f9ee..7c90c43574a6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3457,7 +3457,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 
 	if (nid != NUMA_NO_NODE) {
 		unsigned long old_count = count;
 
-		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+		count += persistent_huge_pages(h) -
+			 (h->nr_huge_pages_node[nid] -
+			  h->surplus_huge_pages_node[nid]);
 		/*
 		 * User may have specified a large count value which caused the
 		 * above calculation to overflow.
In this case, they wanted -- cgit From 80e4a765a770c491e081463c38be927a876e2ce8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 27 Aug 2023 12:08:48 +0100 Subject: mm: refactor si_mem_available() si_mem_available() needlessly places LRU statistics into an array before retrieving only two of them, simply access those directly. In addition, refactor the code so that the blocks of code which calculate the page cache and reclaimable components each resemble one another to clearly indicate we cap both against wmark_low in the same fashion. Link: https://lkml.kernel.org/r/20230827110848.43510-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/show_mem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/show_mem.c b/mm/show_mem.c index 4b888b18bdde..ba0808d6917f 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -34,13 +34,8 @@ long si_mem_available(void) long available; unsigned long pagecache; unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; unsigned long reclaimable; struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += low_wmark_pages(zone); @@ -56,7 +51,8 @@ long si_mem_available(void) * start swapping or thrashing. Assume at least half of the page * cache, or the low watermark worth of cache, needs to stay. */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); pagecache -= min(pagecache / 2, wmark_low); available += pagecache; @@ -67,7 +63,8 @@ long si_mem_available(void) */ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - available += reclaimable - min(reclaimable / 2, wmark_low); + reclaimable -= min(reclaimable / 2, wmark_low); + available += reclaimable; if (available < 0) available = 0; -- cgit From 97144ce008f918249fa7275ee1d29f6f27665c34 Mon Sep 17 00:00:00 2001 From: Vern Hao Date: Fri, 25 Aug 2023 15:57:34 +0800 Subject: mm/vmscan: use folio_migratetype() instead of get_pageblock_migratetype() In skip_cma(), we can use folio_migratetype() to replace get_pageblock_migratetype(). Link: https://lkml.kernel.org/r/20230825075735.52436-1-user@VERNHAO-MC1 Signed-off-by: Vern Hao Reviewed-by: David Hildenbrand Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f13394b112e..93fdeadea896 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2271,7 +2271,7 @@ static bool skip_cma(struct folio *folio, struct scan_control *sc) { return !current_is_kswapd() && gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; + folio_migratetype(folio) == MIGRATE_CMA; } #else static bool skip_cma(struct folio *folio, struct scan_control *sc) -- cgit From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but the patch failed to fully clean it up. 
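With RSS kept in percpu counters since f1a7941243c1, updates land
directly in the shared counter and no per-task folding is needed; the
call sites reduce to (as in the diff below):

	if (nr_swap)
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);	/* no sync_mm_rss() */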
Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- mm/madvise.c | 5 +---- mm/memory.c | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 4dded5d27e7e..59e8860c86af 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -746,11 +746,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, folio_mark_lazyfree(folio); } - if (nr_swap) { - if (current->mm == mm) - sync_mm_rss(mm); + if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); - } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); diff --git a/mm/memory.c b/mm/memory.c index 6c264d2f969c..0739ccb00e61 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -471,8 +471,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; - if (current->mm == mm) - sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); -- cgit From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 23 Aug 2023 00:13:14 +0100 Subject: mm: convert DAX lock/unlock page to lock/unlock folio The one caller of DAX lock/unlock page already calls compound_head(), so use page_folio() instead, then use a folio throughout the DAX code to remove uses of page->mapping and page->index. [jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization] Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jane Chu Acked-by: Naoya Horiguchi Cc: Dan Williams Cc: Jane Chu Signed-off-by: Andrew Morton --- mm/memory-failure.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4d6e43c88489..660c21859118 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1713,20 +1713,23 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); } +/* + * Only dev_pagemap pages get here, such as fsdax when the filesystem + * either do not claim or fails to claim a hwpoison event, or devdax. + * The fsdax pages are initialized per base page, and the devdax pages + * could be initialized either as base pages, or as compound pages with + * vmemmap optimization enabled. Devdax is simplistic in its dealing with + * hwpoison, such that, if a subpage of a compound page is poisoned, + * simply mark the compound head page is by far sufficient. + */ static int mf_generic_kill_procs(unsigned long long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); + struct folio *folio = pfn_folio(pfn); LIST_HEAD(to_kill); dax_entry_t cookie; int rc = 0; - /* - * Pages instantiated by device-dax (not filesystem-dax) - * may be compound pages. - */ - page = compound_head(page); - /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by @@ -1734,11 +1737,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * also prevents changes to the mapping of this pfn until * poison signaling is complete. 
*/ - cookie = dax_lock_page(page); + cookie = dax_lock_folio(folio); if (!cookie) return -EBUSY; - if (hwpoison_filter(page)) { + if (hwpoison_filter(&folio->page)) { rc = -EOPNOTSUPP; goto unlock; } @@ -1760,7 +1763,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * Use this flag as an indication that the dax page has been * remapped UC to prevent speculative consumption of poison. */ - SetPageHWPoison(page); + SetPageHWPoison(&folio->page); /* * Unlike System-RAM there is no possibility to swap in a @@ -1769,11 +1772,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &to_kill, true); + collect_procs(&folio->page, &to_kill, true); - unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags); + unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags); unlock: - dax_unlock_page(page, cookie); + dax_unlock_folio(folio, cookie); return rc; } -- cgit From 77cd814835df22e177a9caac1b046f7ffdfeedd0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 4 Sep 2023 17:08:49 +0200 Subject: mm/vmstat: use this_cpu_try_cmpxchg in mod_{zone,node}_state Use this_cpu_try_cmpxchg instead of this_cpu_cmpxchg (*ptr, old, new) == old in mod_zone_state and mod_node_state. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. No functional change intended. Link: https://lkml.kernel.org/r/20230904150917.8318-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- mm/vmstat.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 00e81e99c6ee..894e4c88d241 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -559,8 +559,10 @@ static inline void mod_zone_state(struct zone *zone, { struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; + o = this_cpu_read(*p); do { z = 0; /* overflow to zone counters */ @@ -576,8 +578,7 @@ static inline void mod_zone_state(struct zone *zone, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; @@ -586,7 +587,7 @@ static inline void mod_zone_state(struct zone *zone, z = n + os; n = -os; } - } while (this_cpu_cmpxchg(*p, o, n) != o); + } while (!this_cpu_try_cmpxchg(*p, &o, n)); if (z) zone_page_state_add(z, zone, item); @@ -616,7 +617,8 @@ static inline void mod_node_state(struct pglist_data *pgdat, { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; s8 __percpu *p = pcp->vm_node_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; if (vmstat_item_in_bytes(item)) { /* @@ -629,6 +631,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, delta >>= PAGE_SHIFT; } + o = this_cpu_read(*p); do { z = 0; /* overflow to node counters */ @@ -644,8 +647,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; @@ -654,7 +656,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, z = n + os; n = -os; } - 
} while (this_cpu_cmpxchg(*p, o, n) != o);
+	} while (!this_cpu_try_cmpxchg(*p, &o, n));
 
 	if (z)
 		node_page_state_add(z, pgdat, item);
--
cgit

From 2eaa6c2abb9dd55041a05c20c451790c124d5cf0 Mon Sep 17 00:00:00 2001
From: Yuan Can
Date: Tue, 5 Sep 2023 20:45:03 +0800
Subject: mm: hugetlb_vmemmap: fix hugetlb page number decrease failed on
 movable nodes

Decreasing the number of hugetlb pages failed with the following
message given:

 sh: page allocation failure: order:0,
  mode:0x204cc0(GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_THISNODE)
 CPU: 1 PID: 112 Comm: sh Not tainted 6.5.0-rc7-... #45
 Hardware name: linux,dummy-virt (DT)
 Call trace:
  dump_backtrace.part.6+0x84/0xe4
  show_stack+0x18/0x24
  dump_stack_lvl+0x48/0x60
  dump_stack+0x18/0x24
  warn_alloc+0x100/0x1bc
  __alloc_pages_slowpath.constprop.107+0xa40/0xad8
  __alloc_pages+0x244/0x2d0
  hugetlb_vmemmap_restore+0x104/0x1e4
  __update_and_free_hugetlb_folio+0x44/0x1f4
  update_and_free_hugetlb_folio+0x20/0x68
  update_and_free_pages_bulk+0x4c/0xac
  set_max_huge_pages+0x198/0x334
  nr_hugepages_store_common+0x118/0x178
  nr_hugepages_store+0x18/0x24
  kobj_attr_store+0x18/0x2c
  sysfs_kf_write+0x40/0x54
  kernfs_fop_write_iter+0x164/0x1dc
  vfs_write+0x3a8/0x460
  ksys_write+0x6c/0x100
  __arm64_sys_write+0x1c/0x28
  invoke_syscall+0x44/0x100
  el0_svc_common.constprop.1+0x6c/0xe4
  do_el0_svc+0x38/0x94
  el0_svc+0x28/0x74
  el0t_64_sync_handler+0xa0/0xc4
  el0t_64_sync+0x174/0x178
 Mem-Info:
 ...

The reason is that the hugetlb pages being released were allocated from
movable nodes, and with hugetlb_optimize_vmemmap enabled, vmemmap pages
need to be allocated from the same node while the hugetlb pages are
being released. With GFP_KERNEL and __GFP_THISNODE set, allocating from
a movable node always fails. Fix this problem by removing
__GFP_THISNODE.

Link: https://lkml.kernel.org/r/20230905124503.24899-1-yuancan@huawei.com
Fixes: ad2fa3717b74 ("mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page")
Signed-off-by: Yuan Can
Reviewed-by: Muchun Song
Cc: Kefeng Wang
Cc: Mike Kravetz
Signed-off-by: Andrew Morton
---
 mm/hugetlb_vmemmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index aeb7dd889eee..3e5387835ad6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -381,7 +381,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
 				   struct list_head *list)
 {
-	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
+	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
 	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
 	int nid = page_to_nid((struct page *)start);
 	struct page *page, *next;
--
cgit

From af8ca1c149069176e6322a77b532e3ffd99ccffe Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)"
Date: Sun, 3 Sep 2023 15:13:22 +0000
Subject: mm/mremap: optimize the start addresses in move_page_tables()

Patch series "Optimize mremap during mutual alignment within PMD", v6.

This patchset optimizes the start addresses in move_page_tables() and
tests the changes. It addresses a warning [1] that occurs due to a
downward, overlapping move on a mutually-aligned offset within a PMD
during exec. By initiating the copy process at the PMD level when such
alignment is present, we can prevent this warning and speed up the
copying process at the same time. Linus Torvalds suggested this idea.
Check the individual patches for more details.
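To make "mutual alignment" concrete (hypothetical addresses, assuming
2 MiB PMDs): src = 0x7f1200356000 and dst = 0x7f1400556000 both sit at
offset 0x156000 within their PMD, so both can be realigned down to a
PMD boundary and copied a PMD at a time:

	src & PMD_MASK = 0x7f1200200000;  dst & PMD_MASK = 0x7f1400400000
	src % PMD_SIZE = dst % PMD_SIZE = 0x156000	/* mutually aligned */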
[1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/ This patch (of 7): Recently, we see reports [1] of a warning that triggers due to move_page_tables() doing a downward and overlapping move on a mutually-aligned offset within a PMD. By mutual alignment, I mean the source and destination addresses of the mremap are at the same offset within a PMD. This mutual alignment along with the fact that the move is downward is sufficient to cause a warning related to having an allocated PMD that does not have PTEs in it. This warning will only trigger when there is mutual alignment in the move operation. A solution, as suggested by Linus Torvalds [2], is to initiate the copy process at the PMD level whenever such alignment is present. Implementing this approach will not only prevent the warning from being triggered, but it will also optimize the operation as this method should enhance the speed of the copy process whenever there's a possibility to start copying at the PMD level. Some more points: a. The optimization can be done only when both the source and destination of the mremap do not have anything mapped below it up to a PMD boundary. I add support to detect that. b. #1 is not a problem for the call to move_page_tables() from exec.c as nothing is expected to be mapped below the source. However, for non-overlapping mutually aligned moves as triggered by mremap(2), I added support for checking such cases. c. I currently only optimize for PMD moves, in the future I/we can build on this work and do PUD moves as well if there is a need for this. But I want to take it one step at a time. d. We need to be careful about mremap of ranges within the VMA itself. For this purpose, I added checks to determine if the address after alignment falls within its VMA itself. [1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/ [2] https://lore.kernel.org/all/CAHk-=whd7msp8reJPfeGNyt0LiySMT0egExx3TVZSX3Ok6X=9g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230903151328.2981432-1-joel@joelfernandes.org Link: https://lkml.kernel.org/r/20230903151328.2981432-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Suggested-by: Linus Torvalds Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mremap.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index fbb4861964f6..e2b65a17148e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -489,6 +489,53 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, return moved; } +/* + * A helper to check if a previous mapping exists. Required for + * move_page_tables() and realign_addr() to determine if a previous mapping + * exists before we can do realignment optimizations. + */ +static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, + unsigned long mask) +{ + unsigned long addr_masked = addr_to_align & mask; + + /* + * If @addr_to_align of either source or destination is not the beginning + * of the corresponding VMA, we can't align down or we will destroy part + * of the current mapping. + */ + if (vma->vm_start != addr_to_align) + return false; + + /* + * Make sure the realignment doesn't cause the address to fall on an + * existing mapping. 
+ */ + return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; +} + +/* Opportunistically realign to specified boundary for faster copy. */ +static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, + unsigned long *new_addr, struct vm_area_struct *new_vma, + unsigned long mask) +{ + /* Skip if the addresses are already aligned. */ + if ((*old_addr & ~mask) == 0) + return; + + /* Only realign if the new and old addresses are mutually aligned. */ + if ((*old_addr & ~mask) != (*new_addr & ~mask)) + return; + + /* Ensure realignment doesn't cause overlap with existing mappings. */ + if (!can_align_down(old_vma, *old_addr, mask) || + !can_align_down(new_vma, *new_addr, mask)) + return; + + *old_addr = *old_addr & mask; + *new_addr = *new_addr & mask; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -508,6 +555,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma, return move_hugetlb_page_tables(vma, new_vma, old_addr, new_addr, len); + /* + * If possible, realign addresses to PMD boundary for faster copy. + * Only realign if the mremap copying hits a PMD boundary. + */ + if ((vma != new_vma) + && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); @@ -577,6 +632,13 @@ again: mmu_notifier_invalidate_range_end(&range); + /* + * Prevent negative return values when {old,new}_addr was realigned + * but we broke out of the above loop for the first PMD itself. + */ + if (len + old_addr < old_end) + return 0; + return len + old_addr - old_end; /* how much done */ } -- cgit From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:23 +0000 Subject: mm/mremap: allow moves within the same VMA for stack moves For the stack move happening in shift_arg_pages(), the move is happening within the same VMA which spans the old and new ranges. In case the aligned address happens to fall within that VMA, allow such moves and don't abort the mremap alignment optimization. In the regular non-stack mremap case, we cannot allow any such moves as will end up destroying some part of the mapping (either the source of the move, or part of the existing mapping). So just avoid it for stack moves. Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mremap.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index e2b65a17148e..ce8a23ef325a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, } /* - * A helper to check if a previous mapping exists. Required for - * move_page_tables() and realign_addr() to determine if a previous mapping - * exists before we can do realignment optimizations. + * A helper to check if aligning down is OK. The aligned address should fall + * on *no mapping*. 
For the stack moving down, that's a special move within + * the VMA that is created to span the source and destination of the move, + * so we make an exception for it. */ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, - unsigned long mask) + unsigned long mask, bool for_stack) { unsigned long addr_masked = addr_to_align & mask; @@ -504,9 +505,13 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ - if (vma->vm_start != addr_to_align) + if (!for_stack && vma->vm_start != addr_to_align) return false; + /* In the stack case we explicitly permit in-VMA alignment. */ + if (for_stack && addr_masked >= vma->vm_start) + return true; + /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. @@ -517,7 +522,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali /* Opportunistically realign to specified boundary for faster copy. */ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, unsigned long *new_addr, struct vm_area_struct *new_vma, - unsigned long mask) + unsigned long mask, bool for_stack) { /* Skip if the addresses are already aligned. */ if ((*old_addr & ~mask) == 0) @@ -528,8 +533,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old return; /* Ensure realignment doesn't cause overlap with existing mappings. */ - if (!can_align_down(old_vma, *old_addr, mask) || - !can_align_down(new_vma, *new_addr, mask)) + if (!can_align_down(old_vma, *old_addr, mask, for_stack) || + !can_align_down(new_vma, *new_addr, mask, for_stack)) return; *old_addr = *old_addr & mask; @@ -539,7 +544,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks) + bool need_rmap_locks, bool for_stack) { unsigned long extent, old_end; struct mmu_notifier_range range; @@ -559,9 +564,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ - if ((vma != new_vma) - && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, + for_stack); flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, @@ -708,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks); + need_rmap_locks, false); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { @@ -722,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * and then proceed to unmap new area instead of old. 
*/ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true); + true, false); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit From 64d4d49c5f3bd9a8a24f52d1e9d16af1b368114b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 8 Sep 2023 16:51:15 -0700 Subject: zswap: change zswap's default allocator to zsmalloc Out of zswap's 3 allocators, zsmalloc is the clear superior in terms of memory utilization, both in theory and as observed in practice, with its high storage density and low internal fragmentation. zsmalloc is also more actively developed and maintained, since it is the allocator of choice for zswap for many users, as well as the only allocator for zram. A historical objection to the selection of zsmalloc as the default allocator for zswap is its lack of writeback capability. However, this has changed, with the zsmalloc writeback patchset, and the subsequent zswap LRU refactor. With this, there is not a lot of good reasons to keep zbud, an otherwise inferior allocator, as the default instead of zswap. This patch changes the default allocator to zsmalloc. The only exception is on settings without MMU, in which case zbud will remain as the default. Link: https://lkml.kernel.org/r/20230908235115.2943486-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5..45f2971ef642 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -130,6 +130,7 @@ config ZSWAP_COMPRESSOR_DEFAULT choice prompt "Default allocator" depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU default ZSWAP_ZPOOL_DEFAULT_ZBUD help Selects the default allocator for the compressed cache for -- cgit From 27e68c4b0d5a945d975140315332ea2e7aa2c57b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:26 +0000 Subject: mm/damon/core: fix a comment about damon_set_attrs() call timings The comment on damon_set_attrs() says it should not be called while the kdamond is running, but now some DAMON modules like sysfs interface and DAMON_RECLAIM call it from after_aggregation() and/or after_wmarks_check() callbacks for online tuning. Update the comment. Link: https://lkml.kernel.org/r/20230907022929.91361-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index bcd2bd9d6c10..9160a0674aff 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -541,7 +541,11 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should not be called while the kdamond is running. + * This function should be called while the kdamond is not running, or an + * access check results aggregation is not ongoing (e.g., from + * &struct damon_callback->after_aggregation or + * &struct damon_callback->after_wmarks_check callbacks). + * * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. 
--
cgit

From 2d00946bd7f4e8c17cbd2fce5fd7c3ab58046dff Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Thu, 7 Sep 2023 02:29:29 +0000
Subject: mm/damon/core: remove 'struct target *' parameter from
 damon_aggregated tracepoint

The damon_aggregated tracepoint receives a 'struct target *' but
doesn't use it. Remove it from the prototype.

Link: https://lkml.kernel.org/r/20230907022929.91361-12-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Cc: Steven Rostedt (Google)
Signed-off-by: Andrew Morton
---
 mm/damon/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/damon/core.c b/mm/damon/core.c
index 9160a0674aff..ca631dd88b33 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -776,7 +776,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
 		struct damon_region *r;
 
 		damon_for_each_region(r, t) {
-			trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
+			trace_damon_aggregated(ti, r, damon_nr_regions(t));
 			r->last_nr_accesses = r->nr_accesses;
 			r->nr_accesses = 0;
 		}
--
cgit

From 7fa38d0ea00ffe2cd3c95c96c85221b8ae221875 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng
Date: Wed, 6 Sep 2023 10:33:12 +0000
Subject: mm: remove duplicated vma->vm_flags check when expanding stack

expand_upwards() and expand_downwards() return -EFAULT if VM_GROWSUP or
VM_GROWSDOWN is not correctly set in vma->vm_flags. However, in the
!CONFIG_STACK_GROWSUP case, expand_stack_locked() first returns -EINVAL
if !(vma->vm_flags & VM_GROWSDOWN) before calling expand_downwards().
To keep consistency with the CONFIG_STACK_GROWSUP case, remove this
check.

The usages of this function are as below:

A: fs/exec.c

 ret = expand_stack_locked(vma, stack_base);
 if (ret)
 	ret = -EFAULT;

or

B: mm/memory.c mm/mmap.c

 if (expand_stack_locked(vma, addr))
 	return NULL;

which means the return value will not propagate to other places, so I
believe there are no user-visible effects of this change, and it's
unnecessary to backport it to earlier versions.

Link: https://lkml.kernel.org/r/20230906103312.645712-1-xiujianfeng@huaweicloud.com
Fixes: f440fa1ac955 ("mm: make find_extend_vma() fail if write lock not held")
Signed-off-by: Xiu Jianfeng
Cc: Liam R. Howlett
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 1ec6c875a7f9..42870c4d8704 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2159,8 +2159,6 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned lon
 #else
 int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
 {
-	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
-		return -EINVAL;
 	return expand_downwards(vma, address);
 }
 
--
cgit

From 6a898c2757af1ac852bb917a0866d2724f303076 Mon Sep 17 00:00:00 2001
From: Yuan Can
Date: Wed, 6 Sep 2023 17:31:57 +0800
Subject: mm: hugetlb_vmemmap: allow alloc vmemmap pages fallback to other
 nodes

In vmemmap_remap_free(), a new head vmemmap page is allocated to avoid
breaking a contiguous block of struct page memory; however, the
allocation can always fail when the given node is a movable node.
Remove __GFP_THISNODE to help avoid fragmentation.
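Dropping __GFP_THISNODE turns a hard node requirement into a
preference; the allocation shape is (illustrative, matching the mask in
the diff below):

	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/* prefers nid, but may now be satisfied from other nodes */
	page = alloc_pages_node(nid, gfp_mask, 0);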
Link: https://lkml.kernel.org/r/20230906093157.9737-1-yuancan@huawei.com
Signed-off-by: Yuan Can
Suggested-by: Mike Kravetz
Reviewed-by: Mike Kravetz
Suggested-by: Muchun Song
Reviewed-by: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/hugetlb_vmemmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 3e5387835ad6..0bde38626d25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -320,8 +320,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
 		.vmemmap_pages	= &vmemmap_pages,
 	};
 	int nid = page_to_nid((struct page *)start);
-	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
-			__GFP_NOWARN;
+	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 
 	/*
 	 * Allocate a new head vmemmap page to avoid breaking a contiguous
--
cgit

From 40dca9b3d65a20c4b7a48252ece572132a299302 Mon Sep 17 00:00:00 2001
From: Yajun Deng
Date: Wed, 6 Sep 2023 17:11:13 +0800
Subject: mm/mm_init.c: remove redundant pr_info when node is memoryless

There is a similar pr_info in free_area_init_node(), so remove the
redundant one.

Before:

 [    0.006314] Initializing node 0 as memoryless
 [    0.006445] Initmem setup node 0 as memoryless
 [    0.006450] Initmem setup node 1 [mem 0x0000000000001000-0x000000003fffffff]
 [    0.006453] Initmem setup node 2 [mem 0x0000000040000000-0x000000007ffd7fff]
 [    0.006454] Initializing node 3 as memoryless
 [    0.006584] Initmem setup node 3 as memoryless
 [    0.006585] Initmem setup node 4 [mem 0x0000000100000000-0x00000001bfffffff]
 [    0.006586] Initmem setup node 5 [mem 0x00000001c0000000-0x00000001ffffffff]
 [    0.006587] Initmem setup node 6 [mem 0x0000000200000000-0x000000023fffffff]

After:

 [    0.004147] Initmem setup node 0 as memoryless
 [    0.004148] Initmem setup node 1 [mem 0x0000000000001000-0x000000003fffffff]
 [    0.004150] Initmem setup node 2 [mem 0x0000000040000000-0x000000007ffd7fff]
 [    0.004154] Initmem setup node 3 as memoryless
 [    0.004155] Initmem setup node 4 [mem 0x0000000100000000-0x00000001bfffffff]
 [    0.004156] Initmem setup node 5 [mem 0x00000001c0000000-0x00000001ffffffff]
 [    0.004157] Initmem setup node 6 [mem 0x0000000200000000-0x000000023fffffff]

Link: https://lkml.kernel.org/r/20230906091113.4029983-1-yajun.deng@linux.dev
Signed-off-by: Yajun Deng
Reviewed-by: David Hildenbrand
Cc: Mike Rapoport (IBM)
Signed-off-by: Andrew Morton
---
 mm/mm_init.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/mm_init.c b/mm/mm_init.c
index 50f2f34745af..6be6f50813b1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1871,8 +1871,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 		pg_data_t *pgdat;
 
 		if (!node_online(nid)) {
-			pr_info("Initializing node %d as memoryless\n", nid);
-
 			/* Allocator not initialized yet */
 			pgdat = arch_alloc_nodedata(nid);
 			if (!pgdat)
--
cgit

From 037dd8f9024bdd5540e2b66eba56c279f740b23c Mon Sep 17 00:00:00 2001
From: Angus Chen
Date: Wed, 6 Sep 2023 16:37:00 +0800
Subject: mm/vmscan: print err before panic

If panic is enabled, the error information will not be printed before
the BUG_ON(), so swap the two. Also print the return value of
PTR_ERR(pgdat->kswapd).
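The general rule the patch applies: emit diagnostics before any
statement that can panic (minimal sketch; `early_boot` is a stand-in
condition):

	if (IS_ERR(ptr)) {
		pr_err("start failed: %ld\n", PTR_ERR(ptr));	/* reaches the log */
		BUG_ON(early_boot);				/* may never return */
	}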
Link: https://lkml.kernel.org/r/20230906083700.181-1-angus.chen@jaguarmicro.com
Signed-off-by: Angus Chen
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 93fdeadea896..4062bcca5311 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7892,8 +7892,9 @@ void __meminit kswapd_run(int nid)
 		pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
 		if (IS_ERR(pgdat->kswapd)) {
 			/* failure at boot is fatal */
+			pr_err("Failed to start kswapd on node %d,ret=%ld\n",
+			       nid, PTR_ERR(pgdat->kswapd));
 			BUG_ON(system_state < SYSTEM_RUNNING);
-			pr_err("Failed to start kswapd on node %d\n", nid);
 			pgdat->kswapd = NULL;
 		}
 	}
--
cgit

From 84e8e54e2ed9d857c4e10a7e487532203385249c Mon Sep 17 00:00:00 2001
From: Ying Sun
Date: Wed, 6 Sep 2023 12:50:12 +0800
Subject: mm/shmem: remove dead code that cannot be satisfied by
 "(CONFIG_SHMEM) && (!(CONFIG_SHMEM))"

The ".fs_flags" assignment on line 4608 is dead code that can never
take effect, because its conditions on line 47 ("#ifdef CONFIG_SHMEM")
and on line 4607 are mutually exclusive. Delete the redundant code.

Link: https://lkml.kernel.org/r/20230906045012.14999-1-sunying@nj.iscas.ac.cn
Signed-off-by: Ying Sun
Suggested-by: Yanjie Ren
Signed-off-by: Andrew Morton
---
 mm/shmem.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 69595d341882..8539d39ec9a3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4585,11 +4585,7 @@ static struct file_system_type shmem_fs_type = {
 	.parameters	= shmem_fs_parameters,
 #endif
 	.kill_sb	= kill_litter_super,
-#ifdef CONFIG_SHMEM
 	.fs_flags	= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
-#else
-	.fs_flags	= FS_USERNS_MOUNT,
-#endif
 };
 
 void __init shmem_init(void)
--
cgit

From 1717449b4417da2d24e2f9db95239e85f1317002 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai
Date: Tue, 12 Sep 2023 01:17:20 +1000
Subject: memfd: drop warning for missing exec-related flags

Commit 434ed3350f57 ("memfd: improve userspace warnings for missing
exec-related flags") attempted to make these warnings more useful (so
they would work as an incentive to get users to switch to specifying
these flags -- as intended by the original MFD_NOEXEC_SEAL patchset).
Unfortunately, it turns out that even INFO-level logging is too extreme
to enable by default, and alternative solutions to the spam issue (such
as doing more extreme rate-limiting per-task) are either too ugly or
overkill for something as simple as emitting a log as a developer aid.

Given that the flags are new and there is no harm in not specifying
them (after all, we maintain backwards compatibility), we can just drop
the warnings for now, until some time in the future when most programs
have migrated and distributions start using vm.memfd_noexec=1 (where
failing to pass the flag would result in unexpected errors for programs
that use executable memfds).
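For userspace, passing the flag explicitly remains the way to opt in; a
minimal example (illustrative; MFD_NOEXEC_SEAL needs Linux 6.3+ and a
recent libc):

	#define _GNU_SOURCE
	#include <sys/mman.h>

	int fd = memfd_create("data", MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	if (fd < 0)
		; /* on older kernels, retry without MFD_NOEXEC_SEAL */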
Link: https://lkml.kernel.org/r/20230912-memfd-reduce-spam-v2-1-7d92a4964b6a@cyphar.com
Fixes: 434ed3350f57 ("memfd: improve userspace warnings for missing exec-related flags")
Fixes: 2562d67b1bdf ("revert "memfd: improve userspace warnings for missing exec-related flags".")
Signed-off-by: Aleksa Sarai
Reported-by: Damian Tometzki
Reviewed-by: Christian Brauner
Cc: Daniel Verkamp
Cc: Jeff Xu
Cc: Kees Cook
Cc: Shuah Khan
Signed-off-by: Andrew Morton
---
 mm/memfd.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm')

diff --git a/mm/memfd.c b/mm/memfd.c
index 2dba2cb6f0d0..d3a1ba4208c9 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -315,12 +315,6 @@ SYSCALL_DEFINE2(memfd_create,
 	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
 		return -EINVAL;
 
-	if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
-		pr_warn_once(
-			"%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n",
-			current->comm, task_pid_nr(current));
-	}
-
 	error = check_sysctl_memfd_noexec(&flags);
 	if (error < 0)
 		return error;
--
cgit

From be1ab60eb04f27e559d8be9655beb04d2bc90b25 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko
Date: Mon, 11 Sep 2023 16:56:59 +0200
Subject: kmsan: simplify kmsan_internal_memmove_metadata()

kmsan_internal_memmove_metadata() is the function that implements copying
metadata every time memcpy()/memmove() is called.

Because shadow memory stores 1 byte per byte of kernel memory, copying
the shadow is trivial and can be done by a single memmove() call.

Origins, on the other hand, are stored as 4-byte values corresponding to
every aligned 4 bytes of kernel memory. Therefore, if either the source
or the destination of kmsan_internal_memmove_metadata() is unaligned, the
number of origin slots corresponding to the source or destination may
differ:

 1) memcpy(0xffff888080a00000, 0xffff888080900000, 4) copies 1 origin
    slot into 1 origin slot:

    src (0xffff888080900000): xxxx
    src origins:              o111
    dst (0xffff888080a00000): xxxx
    dst origins:              o111

 2) memcpy(0xffff888080a00001, 0xffff888080900000, 4) copies 1 origin
    slot into 2 origin slots:

    src (0xffff888080900000): xxxx
    src origins:              o111
    dst (0xffff888080a00000): .xxx x...
    dst origins:              o111 o111

 3) memcpy(0xffff888080a00000, 0xffff888080900001, 4) copies 2 origin
    slots into 1 origin slot:

    src (0xffff888080900000): .xxx x...
    src origins:              o111 o222
    dst (0xffff888080a00000): xxxx
    dst origins:              o111 (or o222)

Previously, kmsan_internal_memmove_metadata() tried to solve this problem
by copying min(src_slots, dst_slots) as is and cloning the missing slot
on one of the ends, if needed.

This was error-prone even in the simple cases where 4 bytes were copied,
and did not account for situations where the total number of nonzero
origin slots could have increased by more than one after copying:

  memcpy(0xffff888080a00000, 0xffff888080900002, 8)

  src (0xffff888080900002): ..xx .... xx..
  src origins:              o111 0000 o222
  dst (0xffff888080a00000): xx.. ..xx
  dst origins:              o111 0000
                        (or 0000 o222)

The new implementation simply copies the shadow byte by byte, and updates
the corresponding origin slot if the shadow byte is nonzero. This
approach can handle complex cases with mixed initialized and
uninitialized bytes. Similarly to KMSAN inline instrumentation, later
writes to bytes sharing the same origin slots take precedence.
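To make the slot arithmetic concrete, here is a stand-alone sketch in
plain user-space C; only KMSAN_ORIGIN_SIZE = 4 is taken from the kernel,
and the addresses reuse the low bits of the 8-byte example above. It
mirrors the per-byte oiter_src/oiter_dst computation in the patch below
and prints which origin slot each copied byte touches on both sides:

#include <stdio.h>

#define KMSAN_ORIGIN_SIZE 4

int main(void)
{
	unsigned long src = 0x900002, dst = 0xa00000;	/* unaligned src */
	int n = 8;
	int src_off = src % KMSAN_ORIGIN_SIZE;
	int dst_off = dst % KMSAN_ORIGIN_SIZE;

	for (int iter = 0; iter < n; iter++) {
		/* Same index math as the new kernel implementation. */
		int oiter_src = (iter + src_off) / KMSAN_ORIGIN_SIZE;
		int oiter_dst = (iter + dst_off) / KMSAN_ORIGIN_SIZE;

		printf("byte %d: src origin slot %d -> dst origin slot %d\n",
		       iter, oiter_src, oiter_dst);
	}
	/* Output shows three source slots feeding two destination slots. */
	return 0;
}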
Link: https://lkml.kernel.org/r/20230911145702.2663753-1-glider@google.com Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 127 ++++++++++++++------------------------------------------ 1 file changed, 31 insertions(+), 96 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 3adb4c1d3b19..c19f47af0424 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -83,131 +83,66 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, /* Copy the metadata following the memmove() behavior. */ void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) { + depot_stack_handle_t prev_old_origin = 0, prev_new_origin = 0; + int i, iter, step, src_off, dst_off, oiter_src, oiter_dst; depot_stack_handle_t old_origin = 0, new_origin = 0; - int src_slots, dst_slots, i, iter, step, skip_bits; depot_stack_handle_t *origin_src, *origin_dst; - void *shadow_src, *shadow_dst; - u32 *align_shadow_src, shadow; + u8 *shadow_src, *shadow_dst; + u32 *align_shadow_dst; bool backwards; shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); if (!shadow_dst) return; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + align_shadow_dst = + (u32 *)ALIGN_DOWN((u64)shadow_dst, KMSAN_ORIGIN_SIZE); shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); if (!shadow_src) { - /* - * @src is untracked: zero out destination shadow, ignore the - * origins, we're done. - */ - __memset(shadow_dst, 0, n); + /* @src is untracked: mark @dst as initialized. */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); return; } KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); - __memmove(shadow_dst, shadow_src, n); - origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); KMSAN_WARN_ON(!origin_dst || !origin_src); - src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - - ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / - KMSAN_ORIGIN_SIZE; - dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - - ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / - KMSAN_ORIGIN_SIZE; - KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); - KMSAN_WARN_ON((src_slots - dst_slots > 1) || - (dst_slots - src_slots < -1)); backwards = dst > src; - i = backwards ? min(src_slots, dst_slots) - 1 : 0; - iter = backwards ? -1 : 1; - - align_shadow_src = - (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); - for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { - KMSAN_WARN_ON(i < 0); - shadow = align_shadow_src[i]; - if (i == 0) { - /* - * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't - * look at the first @src % KMSAN_ORIGIN_SIZE bytes - * of the first shadow slot. - */ - skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow >> skip_bits) << skip_bits; + step = backwards ? -1 : 1; + iter = backwards ? n - 1 : 0; + src_off = (u64)src % KMSAN_ORIGIN_SIZE; + dst_off = (u64)dst % KMSAN_ORIGIN_SIZE; + + /* Copy shadow bytes one by one, updating the origins if necessary. 
*/ + for (i = 0; i < n; i++, iter += step) { + oiter_src = (iter + src_off) / KMSAN_ORIGIN_SIZE; + oiter_dst = (iter + dst_off) / KMSAN_ORIGIN_SIZE; + if (!shadow_src[iter]) { + shadow_dst[iter] = 0; + if (!align_shadow_dst[oiter_dst]) + origin_dst[oiter_dst] = 0; + continue; } - if (i == src_slots - 1) { - /* - * If @src + n isn't aligned on - * KMSAN_ORIGIN_SIZE, don't look at the last - * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the - * last shadow slot. - */ - skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow << skip_bits) >> skip_bits; - } - /* - * Overwrite the origin only if the corresponding - * shadow is nonempty. - */ - if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { - old_origin = origin_src[i]; - new_origin = kmsan_internal_chain_origin(old_origin); + shadow_dst[iter] = shadow_src[iter]; + old_origin = origin_src[oiter_src]; + if (old_origin == prev_old_origin) + new_origin = prev_new_origin; + else { /* * kmsan_internal_chain_origin() may return * NULL, but we don't want to lose the previous * origin value. */ + new_origin = kmsan_internal_chain_origin(old_origin); if (!new_origin) new_origin = old_origin; } - if (shadow) - origin_dst[i] = new_origin; - else - origin_dst[i] = 0; - } - /* - * If dst_slots is greater than src_slots (i.e. - * dst_slots == src_slots + 1), there is an extra origin slot at the - * beginning or end of the destination buffer, for which we take the - * origin from the previous slot. - * This is only done if the part of the source shadow corresponding to - * slot is non-zero. - * - * E.g. if we copy 8 aligned bytes that are marked as uninitialized - * and have origins o111 and o222, to an unaligned buffer with offset 1, - * these two origins are copied to three origin slots, so one of then - * needs to be duplicated, depending on the copy direction (@backwards) - * - * src shadow: |uuuu|uuuu|....| - * src origin: |o111|o222|....| - * - * backwards = 0: - * dst shadow: |.uuu|uuuu|u...| - * dst origin: |....|o111|o222| - fill the empty slot with o111 - * backwards = 1: - * dst shadow: |.uuu|uuuu|u...| - * dst origin: |o111|o222|....| - fill the empty slot with o222 - */ - if (src_slots < dst_slots) { - if (backwards) { - shadow = align_shadow_src[src_slots - 1]; - skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow << skip_bits) >> skip_bits; - if (shadow) - /* src_slots > 0, therefore dst_slots is at least 2 */ - origin_dst[dst_slots - 1] = - origin_dst[dst_slots - 2]; - } else { - shadow = align_shadow_src[0]; - skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow >> skip_bits) << skip_bits; - if (shadow) - origin_dst[0] = origin_dst[1]; - } + origin_dst[oiter_dst] = new_origin; + prev_new_origin = new_origin; + prev_old_origin = old_origin; } } -- cgit From 0be7b2c232cfb1621660c78b0fd7260361c75e27 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:00 +0200 Subject: kmsan: prevent optimizations in memcpy tests Clang 18 learned to optimize away memcpy() calls of small uninitialized scalar values. To ensure that memcpy tests in kmsan_test.c still perform calls to memcpy() (which KMSAN replaces with __msan_memcpy()), declare a separate memcpy_noinline() function with volatile parameters, which won't be optimized. Also retire DO_NOT_OPTIMIZE(), as memcpy_noinline() is apparently enough. 
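The failure mode is easy to reproduce outside the kernel. Below is a
stand-alone sketch of the same pattern, not the kernel test itself;
memcpy_noinline() here mirrors the helper added in the patch. Copying
indeterminate bytes has no observable effect, so the compiler may drop
the plain memcpy() together with its __msan_memcpy() instrumentation,
while the noinline wrapper with volatile parameters keeps the call alive:

#include <string.h>

static __attribute__((noinline)) void *
memcpy_noinline(volatile void *dst, const volatile void *src, size_t size)
{
	return memcpy((void *)dst, (const void *)src, size);
}

int main(void)
{
	long long uninit_src;		/* deliberately left uninitialized */
	volatile long long dst = 0;

	/* A new Clang may elide this call entirely... */
	memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
	/* ...but cannot elide this one. */
	memcpy_noinline(&dst, &uninit_src, sizeof(uninit_src));
	return 0;
}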
Link: https://lkml.kernel.org/r/20230911145702.2663753-2-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 312989aa2865..a8d4ca4a1066 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -407,33 +407,25 @@ static void test_printk(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } -/* - * Prevent the compiler from optimizing @var away. Without this, Clang may - * notice that @var is uninitialized and drop memcpy() calls that use it. - * - * There is OPTIMIZER_HIDE_VAR() in linux/compiler.h that we cannot use here, - * because it is implemented as inline assembly receiving @var as a parameter - * and will enforce a KMSAN check. Same is true for e.g. barrier_data(var). - */ -#define DO_NOT_OPTIMIZE(var) barrier() +/* Prevent the compiler from inlining a memcpy() call. */ +static noinline void *memcpy_noinline(volatile void *dst, + const volatile void *src, size_t size) +{ + return memcpy((void *)dst, (const void *)src, size); +} -/* - * Test case: ensure that memcpy() correctly copies initialized values. - * Also serves as a regression test to ensure DO_NOT_OPTIMIZE() does not cause - * extra checks. - */ +/* Test case: ensure that memcpy() correctly copies initialized values. */ static void test_init_memcpy(struct kunit *test) { EXPECTATION_NO_REPORT(expect); - volatile int src; - volatile int dst = 0; + volatile long long src; + volatile long long dst = 0; - DO_NOT_OPTIMIZE(src); src = 1; kunit_info( test, "memcpy()ing aligned initialized src to aligned dst (no reports)\n"); - memcpy((void *)&dst, (void *)&src, sizeof(src)); + memcpy_noinline((void *)&dst, (void *)&src, sizeof(src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -451,8 +443,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -474,8 +465,9 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst[1], (void *)&uninit_src, + sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -498,8 +490,8 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst[1], (void *)&uninit_src, + sizeof(uninit_src)); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -513,7 +505,6 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) \ kunit_info(test, \ "memset" #size "() should
initialize memory\n"); \ - DO_NOT_OPTIMIZE(uninit); \ memset##size((uint##size##_t *)&uninit, 0, 1); \ kmsan_check_memory((void *)&uninit, sizeof(uninit)); \ KUNIT_EXPECT_TRUE(test, report_matches(&expect)); \ -- cgit From c3ab4873c8d3620493042123471e65b5e2c37b2c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:01 +0200 Subject: kmsan: merge test_memcpy_aligned_to_unaligned{,2}() together Introduce report_reset() that allows checking for more than one KMSAN report per testcase. Fold test_memcpy_aligned_to_unaligned2() into test_memcpy_aligned_to_unaligned(), so that they share the setup phase and check the behavior of a single memcpy() call. Link: https://lkml.kernel.org/r/20230911145702.2663753-3-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index a8d4ca4a1066..6eb1e1a4d08f 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -67,6 +67,17 @@ static bool report_available(void) return READ_ONCE(observed.available); } +/* Reset observed.available, so that the test can trigger another report. */ +static void report_reset(void) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + WRITE_ONCE(observed.available, false); + observed.ignore = false; + spin_unlock_irqrestore(&observed.lock, flags); +} + /* Information we expect in a report. */ struct expect_report { const char *error_type; /* Error type. */ @@ -454,7 +465,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) * * Copying aligned 4-byte value to an unaligned one leads to touching two * aligned 4-byte values. This test case checks that KMSAN correctly reports an - * error on the first of the two values. + * error on the mentioned two values. */ static void test_memcpy_aligned_to_unaligned(struct kunit *test) { @@ -470,28 +481,7 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); -} - -/* - * Test case: ensure that memcpy() correctly copies uninitialized values between - * aligned `src` and unaligned `dst`. - * - * Copying aligned 4-byte value to an unaligned one leads to touching two - * aligned 4-byte values. This test case checks that KMSAN correctly reports an - * error on the second of the two values. 
- */ -static void test_memcpy_aligned_to_unaligned2(struct kunit *test) -{ - EXPECTATION_UNINIT_VALUE_FN(expect, - "test_memcpy_aligned_to_unaligned2"); - volatile int uninit_src; - volatile char dst[8] = { 0 }; - - kunit_info( - test, - "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); - memcpy_noinline((void *)&dst[1], (void *)&uninit_src, - sizeof(uninit_src)); + report_reset(); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -589,7 +579,6 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_init_memcpy), KUNIT_CASE(test_memcpy_aligned_to_aligned), KUNIT_CASE(test_memcpy_aligned_to_unaligned), - KUNIT_CASE(test_memcpy_aligned_to_unaligned2), KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), -- cgit From 46fa84a2b920f91780db3347de81d65efb668301 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:02 +0200 Subject: kmsan: introduce test_memcpy_initialized_gap() Add a regression test for the special case where memcpy() previously failed to correctly set the origins: if, upon a memcpy(), four aligned initialized bytes with a zero origin value ended up split between two aligned four-byte chunks, one of those chunks could've received the zero origin value even though it contained uninitialized bytes from other writes. Link: https://lkml.kernel.org/r/20230911145702.2663753-4-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'mm') diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 6eb1e1a4d08f..07d3a3a5a9c5 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -486,6 +486,58 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +/* + * Test case: ensure that origin slots do not accidentally get overwritten with + * zeroes during memcpy(). + * + * Previously, when copying memory from an aligned buffer to an unaligned one, + * if there were zero origins corresponding to zero shadow values in the source + * buffer, they could have ended up being copied to nonzero shadow values in the + * destination buffer: + * + * memcpy(0xffff888080a00000, 0xffff888080900002, 8) + * + * src (0xffff888080900002): ..xx .... xx.. + * src origins: o111 0000 o222 + * dst (0xffff888080a00000): xx.. ..xx + * dst origins: o111 0000 + * (or 0000 o222) + * + * (here . stands for an initialized byte, and x for an uninitialized one.) + * + * Ensure that this does not happen anymore, and for both destination bytes + * the origin is nonzero (i.e. KMSAN reports an error).
+ */ +static void test_memcpy_initialized_gap(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_initialized_gap"); + volatile char uninit_src[12]; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "unaligned 4-byte initialized value gets a nonzero origin after memcpy() - (2 UMR reports)\n"); + + uninit_src[0] = 42; + uninit_src[1] = 42; + uninit_src[4] = 42; + uninit_src[5] = 42; + uninit_src[6] = 42; + uninit_src[7] = 42; + uninit_src[10] = 42; + uninit_src[11] = 42; + memcpy_noinline((void *)&dst[0], (void *)&uninit_src[2], 8); + + kmsan_check_memory((void *)&dst[0], 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + report_reset(); + kmsan_check_memory((void *)&dst[2], 4); + KUNIT_EXPECT_FALSE(test, report_matches(&expect)); + report_reset(); + kmsan_check_memory((void *)&dst[4], 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + /* Generate test cases for memset16(), memset32(), memset64(). */ #define DEFINE_TEST_MEMSETXX(size) \ static void test_memset##size(struct kunit *test) \ @@ -579,6 +631,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_init_memcpy), KUNIT_CASE(test_memcpy_aligned_to_aligned), KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_initialized_gap), KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), -- cgit From 3ee0aa9f06756e959633ffda37856c6741d948ed Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:14 +0800 Subject: mm: move some shrinker-related function declarations to mm/internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "cleanups for lockless slab shrink", v4. This series is some cleanups for lockless slab shrink. This patch (of 4): The following functions are only used inside the mm subsystem, so it's better to move their declarations to the mm/internal.h file. 1. shrinker_debugfs_add() 2. shrinker_debugfs_detach() 3. shrinker_debugfs_remove() Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Daniel Vetter Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. 
Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/internal.h | 26 ++++++++++++++++++++++++++ mm/shrinker_debug.c | 2 ++ 2 files changed, 28 insertions(+) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 30cf724ddbce..939d1227a527 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1154,4 +1154,30 @@ struct vma_prepare { struct vm_area_struct *remove; struct vm_area_struct *remove2; }; + +/* shrinker related functions */ + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id); +extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) +{ + *debugfs_id = -1; + return NULL; +} +static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 3ab53fad8876..ee0cddb4530f 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -6,6 +6,8 @@ #include #include +#include "internal.h" + /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -- cgit From 96f7b2b9bbb1d680fab1fa96a3958b2d759b881f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:15 +0800 Subject: mm: vmscan: move shrinker-related code into a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mm/vmscan.c file is too large, so separate the shrinker-related code from it into a separate file. No functional changes. Link: https://lkml.kernel.org/r/20230911092517.64141-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. 
Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/Makefile | 4 +- mm/internal.h | 2 + mm/shrinker.c | 709 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 701 --------------------------------------------------------- 4 files changed, 713 insertions(+), 703 deletions(-) create mode 100644 mm/shrinker.c (limited to 'mm') diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..33873c8aedb3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,8 +48,8 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ - readahead.o swap.o truncate.o vmscan.o shmem.o \ - util.o mmzone.o vmstat.o backing-dev.o \ + readahead.o swap.o truncate.o vmscan.o shrinker.o \ + shmem.o util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o shmem_quota.o\ interval_tree.o list_lru.o workingset.o \ diff --git a/mm/internal.h b/mm/internal.h index 939d1227a527..0471d6326d01 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1156,6 +1156,8 @@ struct vma_prepare { }; /* shrinker related functions */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); diff --git a/mm/shrinker.c b/mm/shrinker.c new file mode 100644 index 000000000000..043c87ccfab4 --- /dev/null +++ b/mm/shrinker.c @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); + +#ifdef CONFIG_MEMCG +static int shrinker_nr_max; + +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} + +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + +void free_shrinker_info(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct shrinker_info *info; + int nid; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); + } +} + +int alloc_shrinker_info(struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + int nid, size, ret = 0; + int map_size, defer_size = 0; + + down_write(&shrinker_rwsem); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; + for_each_node(nid) { + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); + ret = -ENOMEM; + break; + } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; + info->map_nr_max = shrinker_nr_max; + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); + } + up_write(&shrinker_rwsem); + + return ret; +} + +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + 
lockdep_is_held(&shrinker_rwsem)); +} + +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int map_size, int defer_size, + int old_map_size, int old_defer_size, + int new_nr_max) +{ + struct shrinker_info *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + int size = map_size + defer_size; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = shrinker_info_protected(memcg, nid); + /* Not yet online memcg */ + if (!old) + return 0; + + /* Already expanded this shrinker_info */ + if (new_nr_max <= old->map_nr_max) + continue; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + new->map_nr_max = new_nr_max; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); + + rcu_assign_pointer(pn->shrinker_info, new); + kvfree_rcu(old, rcu); + } + + return 0; +} + +static int expand_shrinker_info(int new_id) +{ + int ret = 0; + int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; + struct mem_cgroup *memcg; + + if (!root_mem_cgroup) + goto out; + + lockdep_assert_held(&shrinker_rwsem); + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size, + new_nr_max); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto out; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +out: + if (!ret) + shrinker_nr_max = new_nr_max; + + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct shrinker_info *info; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, info->map); + } + rcu_read_unlock(); + } +} + +static DEFINE_IDR(shrinker_idr); + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + if (mem_cgroup_disabled()) + return -ENOSYS; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) { + if (expand_shrinker_info(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + } + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + lockdep_assert_held(&shrinker_rwsem); + + idr_remove(&shrinker_idr, id); +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return 
atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < child_info->map_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} +#else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return -ENOSYS; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + +#define SHRINK_BATCH 128 + +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, int priority) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = xchg_nr_deferred(shrinker, shrinkctl); + + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. 
+ */ + delta = freeable / 2; + } + + total_scan = nr >> priority; + total_scan += delta; + total_scan = min(total_scan, (2 * freeable)); + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + freeable, delta, total_scan, priority); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; + + cond_resched(); + } + + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. + */ + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + return freed; +} + +#ifdef CONFIG_MEMCG +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct shrinker_info *info; + unsigned long ret, freed = 0; + int i; + + if (!mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) + goto unlock; + + for_each_set_bit(i, info->map, info->map_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(i, info->map); + continue; + } + + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, info->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. 
+ * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* !CONFIG_MEMCG */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @priority: the reclaim priority + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. + * + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. + * + * Returns the number of reclaimed slab objects. + */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority) +{ + unsigned long ret, freed = 0; + struct shrinker *shrinker; + + /* + * The root memcg might be allocated even though memcg is disabled + * via "cgroup_disable=memory" boot parameter. This could make + * mem_cgroup_is_root() return false, then just run memcg slab + * shrink, but skip global shrink. This may result in premature + * oom. + */ + if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); + + if (!down_read_trylock(&shrinker_rwsem)) + goto out; + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} + +/* + * Add a shrinker callback to be called from the vm. + */ +static int __prealloc_shrinker(struct shrinker *shrinker) +{ + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __prealloc_shrinker(shrinker); +} +#endif + +void free_prealloced_shrinker(struct shrinker *shrinker) +{ +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); + shrinker->name = NULL; +#endif + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + return; + } + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} + +void register_shrinker_prepared(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} + +static int __register_shrinker(struct shrinker *shrinker) +{ + int err = __prealloc_shrinker(shrinker); + + if (err) + return err; + register_shrinker_prepared(shrinker); + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __register_shrinker(shrinker); +} +#endif +EXPORT_SYMBOL(register_shrinker); + +/* + * Remove one + */ +void unregister_shrinker(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry; + int debugfs_id; + + if (!(shrinker->flags & SHRINKER_REGISTERED)) + return; + + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + up_write(&shrinker_rwsem); + + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} +EXPORT_SYMBOL(unregister_shrinker); + +/** + * synchronize_shrinkers - Wait for all running shrinkers to complete. + * + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. 
+ */ +void synchronize_shrinkers(void) +{ + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(synchronize_shrinkers); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4062bcca5311..4a6551510b65 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -188,246 +187,7 @@ struct scan_control { */ int vm_swappiness = 60; -LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); - #ifdef CONFIG_MEMCG -static int shrinker_nr_max; - -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) -{ - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); -} - -static inline int shrinker_defer_size(int nr_items) -{ - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); -} - -static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, - int nid) -{ - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); -} - -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) -{ - struct shrinker_info *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - int size = map_size + defer_size; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = shrinker_info_protected(memcg, nid); - /* Not yet online memcg */ - if (!old) - return 0; - - /* Already expanded this shrinker_info */ - if (new_nr_max <= old->map_nr_max) - continue; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; - new->map_nr_max = new_nr_max; - - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); - - rcu_assign_pointer(pn->shrinker_info, new); - kvfree_rcu(old, rcu); - } - - return 0; -} - -void free_shrinker_info(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct shrinker_info *info; - int nid; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - info = rcu_dereference_protected(pn->shrinker_info, true); - kvfree(info); - rcu_assign_pointer(pn->shrinker_info, NULL); - } -} - -int alloc_shrinker_info(struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; - - down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; - for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; - info->map_nr_max = shrinker_nr_max; - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - up_write(&shrinker_rwsem); - - return ret; -} - -static int expand_shrinker_info(int new_id) -{ - int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; - struct mem_cgroup *memcg; - - if 
(!root_mem_cgroup) - goto out; - - lockdep_assert_held(&shrinker_rwsem); - - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, - new_nr_max); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto out; - } - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -out: - if (!ret) - shrinker_nr_max = new_nr_max; - - return ret; -} - -void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct shrinker_info *info; - - rcu_read_lock(); - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); - } - rcu_read_unlock(); - } -} - -static DEFINE_IDR(shrinker_idr); - -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - int id, ret = -ENOMEM; - - if (mem_cgroup_disabled()) - return -ENOSYS; - - down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); - if (id < 0) - goto unlock; - - if (id >= shrinker_nr_max) { - if (expand_shrinker_info(id)) { - idr_remove(&shrinker_idr, id); - goto unlock; - } - } - shrinker->id = id; - ret = 0; -unlock: - up_write(&shrinker_rwsem); - return ret; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ - int id = shrinker->id; - - BUG_ON(id < 0); - - lockdep_assert_held(&shrinker_rwsem); - - idr_remove(&shrinker_idr, id); -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); -} - -void reparent_shrinker_deferred(struct mem_cgroup *memcg) -{ - int i, nid; - long nr; - struct mem_cgroup *parent; - struct shrinker_info *child_info, *parent_info; - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); - for_each_node(nid) { - child_info = shrinker_info_protected(memcg, nid); - parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); - } - } - up_read(&shrinker_rwsem); -} /* Returns true for reclaim through cgroup limits or cgroup interfaces. 
*/ static bool cgroup_reclaim(struct scan_control *sc) @@ -468,27 +228,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) return false; } #else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return -ENOSYS; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -557,39 +296,6 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static long xchg_nr_deferred(struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return xchg_nr_deferred_memcg(nid, shrinker, - sc->memcg); - - return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); -} - - -static long add_nr_deferred(long nr, struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return add_nr_deferred_memcg(nr, nid, shrinker, - sc->memcg); - - return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); -} - static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) @@ -671,413 +377,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, return size; } -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
-{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); - -#define SHRINK_BATCH 128 - -static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, int priority) -{ - unsigned long freed = 0; - unsigned long long delta; - long total_scan; - long freeable; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - long scanned = 0, next_deferred; - - freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0 || freeable == SHRINK_EMPTY) - return freeable; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = xchg_nr_deferred(shrinker, shrinkctl); - - if (shrinker->seeks) { - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); - } else { - /* - * These objects don't require any IO to create. 
Trim - * them aggressively under memory pressure to keep - * them from causing refetches in the IO caches. - */ - delta = freeable / 2; - } - - total_scan = nr >> priority; - total_scan += delta; - total_scan = min(total_scan, (2 * freeable)); - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); - - /* - * Normally, we should not scan less than batch_size objects in one - * pass to avoid too frequent shrinker calls, but if the slab has less - * than batch_size objects in total and we are really tight on memory, - * we will try to reclaim all available objects, otherwise we can end - * up failing allocations although there are plenty of reclaimable - * objects spread over several slabs with usage less than the - * batch_size. - * - * We detect the "tight on memory" situations by looking at the total - * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (freeable), we must be - * scanning at high prio and therefore should try to reclaim as much as - * possible. - */ - while (total_scan >= batch_size || - total_scan >= freeable) { - unsigned long ret; - unsigned long nr_to_scan = min(batch_size, total_scan); - - shrinkctl->nr_to_scan = nr_to_scan; - shrinkctl->nr_scanned = nr_to_scan; - ret = shrinker->scan_objects(shrinker, shrinkctl); - if (ret == SHRINK_STOP) - break; - freed += ret; - - count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); - total_scan -= shrinkctl->nr_scanned; - scanned += shrinkctl->nr_scanned; - - cond_resched(); - } - - /* - * The deferred work is increased by any new work (delta) that wasn't - * done, decreased by old deferred work that was done now. - * - * And it is capped to two times of the freeable items. - */ - next_deferred = max_t(long, (nr + delta - scanned), 0); - next_deferred = min(next_deferred, (2 * freeable)); - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. - */ - new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); - return freed; -} - -#ifdef CONFIG_MEMCG -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - struct shrinker_info *info; - unsigned long ret, freed = 0; - int i; - - if (!mem_cgroup_online(memcg)) - return 0; - - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); - if (unlikely(!info)) - goto unlock; - - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; - - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } - - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. 
- * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } -unlock: - up_read(&shrinker_rwsem); - return freed; -} -#else /* CONFIG_MEMCG */ -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * shrink_slab - shrink slab caches - * @gfp_mask: allocation context - * @nid: node whose slab caches to target - * @memcg: memory cgroup whose slab caches to target - * @priority: the reclaim priority - * - * Call the shrink functions to age shrinkable caches. - * - * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, - * unaware shrinkers will receive a node id of 0 instead. - * - * @memcg specifies the memory cgroup to target. Unaware shrinkers - * are called only if it is the root cgroup. - * - * @priority is sc->priority, we take the number of objects and >> by priority - * in order to get the scan target. - * - * Returns the number of reclaimed slab objects. - */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) -{ - unsigned long ret, freed = 0; - struct shrinker *shrinker; - - /* - * The root memcg might be allocated even though memcg is disabled - * via "cgroup_disable=memory" boot parameter. This could make - * mem_cgroup_is_root() return false, then just run memcg slab - * shrink, but skip global shrink. This may result in premature - * oom. - */ - if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) - return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } - - up_read(&shrinker_rwsem); -out: - cond_resched(); - return freed; -} - static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; -- cgit From 1dd49e58f966b1eecd935dc28458a8369ae94ad1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:16 +0800 Subject: mm: shrinker: remove redundant shrinker_rwsem in debugfs operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit debugfs_remove_recursive() will wait for debugfs_file_put() to return, so the shrinker will not be freed when doing debugfs operations (such as shrinker_debugfs_count_show() and shrinker_debugfs_scan_write()), so there is no need to hold shrinker_rwsem during debugfs operations. Link: https://lkml.kernel.org/r/20230911092517.64141-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index ee0cddb4530f..e4ce509f619e 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -51,17 +51,12 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret, nid; + int ret = 0, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - kfree(count_per_node); - return ret; - } rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -94,7 +89,6 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); rcu_read_unlock(); - up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -119,7 +113,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, struct mem_cgroup *memcg = NULL; int nid; char kbuf[72]; - ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -148,12 +141,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - mem_cgroup_put(memcg); - return ret; - } - sc.nid = nid; sc.memcg = memcg; sc.nr_to_scan = nr_to_scan; @@ -161,7 +148,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; -- cgit From 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:17 +0800 Subject: drm/ttm: introduce pool_shrink_rwsem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, synchronize_shrinkers() is only used by TTM pool. It only requires that no shrinkers run in parallel. After we use RCU+refcount method to implement the lockless slab shrink, we can not use shrinker_rwsem or synchronize_rcu() to guarantee that all shrinker invocations have seen an update before freeing memory. 
So we introduce a new pool_shrink_rwsem to implement a private ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose. Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Christian König Acked-by: Daniel Vetter Cc: Christian Brauner Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index 043c87ccfab4..a16cd448b924 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -692,18 +692,3 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = NULL; } EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); -- cgit From c42d50aefd17a6bad3ed617769edbbb579137545 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:00 +0800 Subject: mm: shrinker: add infrastructure for dynamically allocating shrinker Patch series "use refcount+RCU method to implement lockless slab shrink", v6. 1. Background ============= We used to implement the lockless slab shrink with SRCU [1], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [2], so we reverted it [3]. This patch series aims to re-implement the lockless slab shrink using the refcount+RCU method proposed by Dave Chinner [4]. [1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ 2. 
Implementation
=================

Currently, the shrinker instances can be divided into the following three types:

a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of the shrinker instance is never freed. For case b, the memory of the shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of the shrinker instance will be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c; the memory can then be freed independently by calling kfree_rcu().

This patchset adds the following new APIs for dynamically allocating a shrinker, and adds a private_data field to struct shrinker to record and get the original embedded structure.

1. shrinker_alloc()
2. shrinker_register()
3. shrinker_free()

In order to simplify shrinker-related APIs and make the shrinker more independent of other kernel mechanisms, this patchset uses the above APIs to convert all shrinkers (including cases a and b) to dynamic allocation, and then removes all existing APIs. This will also have another advantage mentioned by Dave Chinner:

```
The other advantage of this is that it will break all the existing out of
tree code and third party modules using the old API and will no longer work
with a kernel using lockless slab shrinkers. They need to break (both at the
source and binary levels) to stop bad things from happening due to using
unconverted shrinkers in the new setup.
```

Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. The shrinker::refcount mechanism ensures that the shrinker instance will not be run again after unregistration, so the structure that records the pointer to the shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to block in unregister_shrinker() waiting for the RCU read-side critical section to finish.

PATCH 1: introduce new APIs
PATCH 2~38: convert all shrinkers to use new APIs
PATCH 39: remove old APIs
PATCH 40~41: some cleanups and preparations
PATCH 42~43: implement the lockless slab shrink
PATCH 44~45: convert shrinker_rwsem to mutex

3. Testing
==========

3.1 slab shrink stress test
---------------------------

We can reproduce the down_read_trylock() hotspot with the following script:

```
DIR="/root/shrinker/memcg/mnt"

do_create()
{
	mkdir -p /sys/fs/cgroup/memory/test
	echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
	for i in `seq 0 $1`;
	do
		mkdir -p /sys/fs/cgroup/memory/test/$i;
		echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
		mkdir -p $DIR/$i;
	done
}

do_mount()
{
	for i in `seq $1 $2`;
	do
		mount -t tmpfs $i $DIR/$i;
	done
}

do_touch()
{
	for i in `seq $1 $2`;
	do
		echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
		dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
	done
}

case "$1" in
  touch)
	do_touch $2 $3
	;;
  test)
	do_create 4000
	do_mount 0 4000
	do_touch 0 3000
	;;
  *)
	exit 1
	;;
esac
```

Save the above script, then run the test and touch commands.
Then we can use the following perf command to view hotspots:

perf top -U -F 999

1) Before applying this patchset:

  33.15%  [kernel]  [k] down_read_trylock
  25.38%  [kernel]  [k] shrink_slab
  21.75%  [kernel]  [k] up_read
   4.45%  [kernel]  [k] _find_next_bit
   2.27%  [kernel]  [k] do_shrink_slab
   1.80%  [kernel]  [k] intel_idle_irq
   1.79%  [kernel]  [k] shrink_lruvec
   0.67%  [kernel]  [k] xas_descend
   0.41%  [kernel]  [k] mem_cgroup_iter
   0.40%  [kernel]  [k] shrink_node
   0.38%  [kernel]  [k] list_lru_count_one

2) After applying this patchset:

  64.56%  [kernel]  [k] shrink_slab
  12.18%  [kernel]  [k] do_shrink_slab
   3.30%  [kernel]  [k] __rcu_read_unlock
   2.61%  [kernel]  [k] shrink_lruvec
   2.49%  [kernel]  [k] __rcu_read_lock
   1.93%  [kernel]  [k] intel_idle_irq
   0.89%  [kernel]  [k] shrink_node
   0.81%  [kernel]  [k] mem_cgroup_iter
   0.77%  [kernel]  [k] mem_cgroup_calculate_protection
   0.66%  [kernel]  [k] list_lru_count_one

We can see that the first perf hotspot becomes shrink_slab, which is what we expect.

3.2 registration and unregistration stress test
-----------------------------------------------

Run the command below to test:

stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 &

1) Before applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            473062     60.00      8.00    279.13      7884.12        1647.59
for a 60.01s run time:
   1440.34s available CPU time
      7.99s user time   (  0.55%)
    279.13s system time ( 19.38%)
    287.12s total time  ( 19.93%)
load average: 7.12 2.99 1.15
successful run completed in 60.01s (1 min, 0.01 secs)

2) After applying this patchset:

setting to a 60 second run per stressor
dispatching hogs: 9 ramfs
stressor       bogo ops real time  usr time  sys time   bogo ops/s     bogo ops/s
                          (secs)    (secs)    (secs)   (real time) (usr+sys time)
ramfs            477165     60.00      8.13    281.34      7952.55        1648.40
for a 60.01s run time:
   1440.33s available CPU time
      8.12s user time   (  0.56%)
    281.34s system time ( 19.53%)
    289.46s total time  ( 20.10%)
load average: 6.98 3.03 1.19
successful run completed in 60.01s (1 min, 0.01 secs)

We can see that the ops/s has hardly changed.

This patch (of 45):

Currently, the shrinker instances can be divided into the following three types:

a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker.

b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86.

c) shrinker instance embedded in other structures.

For case a, the memory of the shrinker instance is never freed. For case b, the memory of the shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of the shrinker instance will be freed along with the structure it is embedded in.

In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c; the memory can then be freed independently by calling kfree_rcu().

So this commit adds the following new APIs for dynamically allocating a shrinker, and adds a private_data field to struct shrinker to record and get the original embedded structure.

1. shrinker_alloc()

Used to allocate the shrinker instance itself and related memory; it returns a pointer to the shrinker instance on success and NULL on failure.

2. shrinker_register()

Used to register the shrinker instance, which is the same as the current register_shrinker_prepared().

3. shrinker_free()

Used to unregister (if needed) and free the shrinker instance.
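As an illustration of how these pieces fit together, here is a minimal usage sketch for a structure-embedded shrinker. It is not part of the patch: my_cache and its callbacks are hypothetical, while shrinker_alloc(), shrinker_register(), shrinker_free() and the private_data field are the APIs described above.

```
/*
 * Minimal usage sketch, not from the patch itself. my_cache and the
 * my_cache_count()/my_cache_scan() callbacks are hypothetical; the
 * shrinker_alloc()/shrinker_register()/shrinker_free() calls and the
 * private_data field are the new APIs described above.
 */
#include <linux/shrinker.h>

struct my_cache {
	struct shrinker *shrinker;	/* was: struct shrinker shrinker; */
	unsigned long nr_objects;
};

static unsigned long my_cache_count(struct shrinker *shrinker,
				    struct shrink_control *sc)
{
	struct my_cache *cache = shrinker->private_data;

	return cache->nr_objects ? cache->nr_objects : SHRINK_EMPTY;
}

static unsigned long my_cache_scan(struct shrinker *shrinker,
				   struct shrink_control *sc)
{
	/* A real implementation would free up to sc->nr_to_scan objects. */
	return SHRINK_STOP;
}

static int my_cache_shrinker_init(struct my_cache *cache)
{
	cache->shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, "my-cache");
	if (!cache->shrinker)
		return -ENOMEM;

	cache->shrinker->count_objects = my_cache_count;
	cache->shrinker->scan_objects = my_cache_scan;
	cache->shrinker->private_data = cache;	/* recover the embedder */

	shrinker_register(cache->shrinker);
	return 0;
}

static void my_cache_shrinker_exit(struct my_cache *cache)
{
	/* Unregisters (if still registered) and frees the instance. */
	shrinker_free(cache->shrinker);
}
```

The zsmalloc, THP and workingset conversions later in this log follow this same pattern.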
In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, subsequent submissions will use the above API to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using unconverted shrinkers in the new setup. ``` [zhengqi.arch@bytedance.com: mm: shrinker: some cleanup] Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/internal.h | 22 +++++++++++ mm/shrinker.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/shrinker_debug.c | 3 -- 3 files changed, 128 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 0471d6326d01..a273f4d948d8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1160,6 +1160,20 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + + return shrinker->name ? 
0 : -ENOMEM; +} + +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ + kfree_const(shrinker->name); + shrinker->name = NULL; +} + extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); @@ -1170,6 +1184,14 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + return 0; +} +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ +} static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { diff --git a/mm/shrinker.c b/mm/shrinker.c index a16cd448b924..d1032a4d5684 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -550,6 +550,112 @@ out: return freed; } +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) +{ + struct shrinker *shrinker; + unsigned int size; + va_list ap; + int err; + + shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); + if (!shrinker) + return NULL; + + va_start(ap, fmt); + err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); + va_end(ap); + if (err) + goto err_name; + + shrinker->flags = flags | SHRINKER_ALLOCATED; + shrinker->seeks = DEFAULT_SEEKS; + + if (flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err == -ENOSYS) { + /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + goto non_memcg; + } + + if (err) + goto err_flags; + + return shrinker; + } + +non_memcg: + /* + * The nr_deferred is available on per memcg level for memcg aware + * shrinkers, so only allocate nr_deferred in the following cases: + * - non-memcg-aware shrinkers + * - !CONFIG_MEMCG + * - memcg is disabled by kernel command line + */ + size = sizeof(*shrinker->nr_deferred); + if (flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + goto err_flags; + + return shrinker; + +err_flags: + shrinker_debugfs_name_free(shrinker); +err_name: + kfree(shrinker); + return NULL; +} +EXPORT_SYMBOL_GPL(shrinker_alloc); + +void shrinker_register(struct shrinker *shrinker) +{ + if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { + pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); + return; + } + + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL_GPL(shrinker_register); + +void shrinker_free(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry = NULL; + int debugfs_id; + + if (!shrinker) + return; + + down_write(&shrinker_rwsem); + if (shrinker->flags & SHRINKER_REGISTERED) { + list_del(&shrinker->list); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + shrinker->flags &= ~SHRINKER_REGISTERED; + } + + shrinker_debugfs_name_free(shrinker); + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + + if (debugfs_entry) + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + + kfree(shrinker); +} +EXPORT_SYMBOL_GPL(shrinker_free); + /* * Add a shrinker callback to be called from the vm. 
*/ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e4ce509f619e..24aebe7c24cc 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -241,9 +241,6 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, lockdep_assert_held(&shrinker_rwsem); - kfree_const(shrinker->name); - shrinker->name = NULL; - *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; -- cgit From 54d917295b8366545e6ee940c93dffe1452e6480 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:16 +0800 Subject: mm: thp: dynamically allocate the thp-related shrinkers Use new APIs to dynamically allocate the thp-zero and thp-deferred_split shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-18-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/huge_memory.c | 67 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 064fbd90822b..1cfd83e91748 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<count_objects = shrink_huge_zero_page_count; + huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; + shrinker_register(huge_zero_page_shrinker); + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + return 0; +} + +static void __init thp_shrinker_exit(void) +{ + shrinker_free(huge_zero_page_shrinker); + shrinker_free(deferred_split_shrinker); +} + static int __init hugepage_init(void) { int err; @@ -482,12 +514,9 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); - if (err) - goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); + err = thp_shrinker_init(); if (err) - goto err_split_shrinker; + goto err_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -505,10 +534,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: - unregister_shrinker(&deferred_split_shrinker); -err_split_shrinker: - unregister_shrinker(&huge_zero_page_shrinker); -err_hzp_shrinker: + thp_shrinker_exit(); +err_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); @@ -2828,7 +2855,7 @@ void deferred_split_folio(struct folio *folio) #ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker.id); + deferred_split_shrinker->id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -2902,14 +2929,6 @@ next: return split; } -static struct shrinker deferred_split_shrinker = { - .count_objects = deferred_split_count, - .scan_objects = deferred_split_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | - SHRINKER_NONSLAB, -}; - #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { -- cgit From 219c666eb2854bb9ca94d04a53ceb14aaf81cb28 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:18 +0800 Subject: mm: workingset: dynamically allocate the mm-shadow shrinker Use new APIs to dynamically allocate the mm-shadow shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-20-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/workingset.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index da58a26d0d4d..b192e44a0e7c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -763,13 +763,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, NULL); } -static struct shrinker workingset_shadow_shrinker = { - .count_objects = count_shadow_nodes, - .scan_objects = scan_shadow_nodes, - .seeks = 0, /* ->count reports only fully expendable nodes */ - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, -}; - /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. @@ -778,9 +771,10 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; - int ret; + int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* @@ -797,17 +791,26 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); - if (ret) + workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | + SHRINKER_MEMCG_AWARE, + "mm-shadow"); + if (!workingset_shadow_shrinker) goto err; + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, - &workingset_shadow_shrinker); + workingset_shadow_shrinker); if (ret) goto err_list_lru; - register_shrinker_prepared(&workingset_shadow_shrinker); + + workingset_shadow_shrinker->count_objects = count_shadow_nodes; + workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; + /* ->count reports only fully expendable nodes */ + workingset_shadow_shrinker->seeks = 0; + + shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: - free_prealloced_shrinker(&workingset_shadow_shrinker); + shrinker_free(workingset_shadow_shrinker); err: return ret; } -- cgit From c19b548b493455cfb80895074687152869e77742 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:36 +0800 Subject: zsmalloc: dynamically allocate the mm-zspool shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mm-zspool shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct zs_pool. 
Link: https://lkml.kernel.org/r/20230911094444.68966-38-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b58f957429f0..c743ce7a5f49 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -229,7 +229,7 @@ struct zs_pool { struct zs_pool_stats stats; /* Compact classes */ - struct shrinker shrinker; + struct shrinker *shrinker; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; @@ -2086,8 +2086,7 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long pages_freed; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; /* * Compact classes and calculate compaction delta. 
@@ -2105,8 +2104,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, int i; struct size_class *class; unsigned long pages_to_free = 0; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2121,18 +2119,23 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - unregister_shrinker(&pool->shrinker); + shrinker_free(pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) { - pool->shrinker.scan_objects = zs_shrinker_scan; - pool->shrinker.count_objects = zs_shrinker_count; - pool->shrinker.batch = 0; - pool->shrinker.seeks = DEFAULT_SEEKS; + pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); + if (!pool->shrinker) + return -ENOMEM; + + pool->shrinker->scan_objects = zs_shrinker_scan; + pool->shrinker->count_objects = zs_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->private_data = pool; - return register_shrinker(&pool->shrinker, "mm-zspool:%s", - pool->name); + shrinker_register(pool->shrinker); + + return 0; } static int calculate_zspage_chain_size(int class_size) -- cgit From f2383e01507eeee8a1c1283d61a117a97d6c4ebe Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:38 +0800 Subject: mm: shrinker: remove old APIs Now no users are using the old APIs, just remove them. Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 143 ---------------------------------------------------------- 1 file changed, 143 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index d1032a4d5684..736fa67e8454 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -655,146 +655,3 @@ void shrinker_free(struct shrinker *shrinker) kfree(shrinker); } EXPORT_SYMBOL_GPL(shrinker_free); - -/* - * Add a shrinker callback to be called from the vm. 
- */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); -- cgit From 307bececcd1205bcb67a3c0d53a69db237ccc9d4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:39 +0800 Subject: mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred} Currently, we maintain two linear arrays per node per memcg, which are shrinker_info::map and shrinker_info::nr_deferred. 
And we need to resize them when the shrinker_nr_max is exceeded, that is, allocate a new array, then copy the old array to the new array, and finally free the old array by RCU.

For shrinker_info::map, we do set_bit() under the RCU lock, so we may set the value into the old map which is about to be freed. This may cause the value set to be lost. The current solution is not to copy the old map when resizing, but to set all the corresponding bits in the new map to 1. This solves the data loss problem, but brings the overhead of more pointless loops while doing memcg slab shrink.

For shrinker_info::nr_deferred, we will only modify it under the read lock of shrinker_rwsem, so it will not run concurrently with the resizing. But after we make memcg slab shrink lockless, there will be the same data loss problem as with shrinker_info::map, and we can't work around it the way we do for the map.

For such resizable arrays, the most straightforward idea is to change them to an xarray, like we did for list_lru [1]. We would need to do xa_store() in the list_lru_add()-->set_shrinker_bit() path, but this will cause memory allocation, and list_lru_add() doesn't accept failure. A possible solution is to pre-allocate, but the location of the pre-allocation is not well determined (such as in the deferred_split_shrinker case).

Therefore, this commit chooses to introduce the following secondary array for shrinker_info::{map, nr_deferred}:

 +---------------+--------+--------+-----+
 | shrinker_info | unit 0 | unit 1 | ... | (secondary array)
 +---------------+--------+--------+-----+
                     |
                     v
 +---------------+-----+
 | nr_deferred[] | map | (leaf array)
 +---------------+-----+
 (shrinker_info_unit)

The leaf array is never freed unless the memcg is destroyed. The secondary array will be resized every time the shrinker id exceeds shrinker_nr_max, so the shrinker_info_unit can be indexed from both the old and the new shrinker_info->unit[x]. Then even if we get the old secondary array under the RCU lock, the map and nr_deferred found through it are still valid, so the updated nr_deferred and map will not be lost.

[1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/

[zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()] Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E.
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 250 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 154 insertions(+), 96 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index 736fa67e8454..e9644cda80b5 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) +static inline int shrinker_unit_size(int nr_items) { - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); + return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } -static inline int shrinker_defer_size(int nr_items) +static inline void shrinker_unit_free(struct shrinker_info *info, int start) { - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); + struct shrinker_info_unit **unit; + int nr, i; + + if (!info) + return; + + unit = info->unit; + nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); + + for (i = start; i < nr; i++) { + if (!unit[i]) + break; + + kfree(unit[i]); + unit[i] = NULL; + } +} + +static inline int shrinker_unit_alloc(struct shrinker_info *new, + struct shrinker_info *old, int nid) +{ + struct shrinker_info_unit *unit; + int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); + int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; + int i; + + for (i = start; i < nr; i++) { + unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); + if (!unit) { + shrinker_unit_free(new, start); + return -ENOMEM; + } + + new->unit[i] = unit; + } + + return 0; } void free_shrinker_info(struct mem_cgroup *memcg) @@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg) for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); + shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } @@ -40,28 +76,28 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; + int nid, ret = 0; + int array_size = 0; down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; + array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; + info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + if (!info) + goto err; info->map_nr_max = shrinker_nr_max; + if (shrinker_unit_alloc(info, NULL, nid)) + goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; + +err: + up_write(&shrinker_rwsem); + free_shrinker_info(memcg); + return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, @@ 
-71,15 +107,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, lockdep_is_held(&shrinker_rwsem)); } -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, + int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; - int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -92,21 +125,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (new_nr_max <= old->map_nr_max) continue; - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; new->map_nr_max = new_nr_max; - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); + memcpy(new->unit, old->unit, old_size); + if (shrinker_unit_alloc(new, old, nid)) { + kvfree(new); + return -ENOMEM; + } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -118,9 +147,8 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, static int expand_shrinker_info(int new_id) { int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; + int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); + int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) @@ -128,15 +156,12 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_rwsem); - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); + new_size = shrinker_unit_size(new_nr_max); + old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, + ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); @@ -150,17 +175,34 @@ out: return ret; } +static inline int shrinker_id_to_index(int shrinker_id) +{ + return shrinker_id / SHRINKER_UNIT_BITS; +} + +static inline int shrinker_id_to_offset(int shrinker_id) +{ + return shrinker_id % SHRINKER_UNIT_BITS; +} + +static inline int calc_shrinker_id(int index, int offset) +{ + return index * SHRINKER_UNIT_BITS + offset; +} + void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; + struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); + set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } @@ -209,26 
+251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { - int i, nid; + int nid, index, offset; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; + struct shrinker_info_unit *child_unit, *parent_unit; parent = parent_mem_cgroup(memcg); if (!parent) @@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); + for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { + child_unit = child_info->unit[index]; + parent_unit = parent_info->unit[index]; + for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { + nr = atomic_long_read(&child_unit->nr_deferred[offset]); + atomic_long_add(nr, &parent_unit->nr_deferred[offset]); + } } } up_read(&shrinker_rwsem); @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int i; + int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (unlikely(!info)) goto unlock; - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; + for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + struct shrinker_info_unit *unit; - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } + unit = info->unit[index]; - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + int shrinker_id = calc_shrinker_id(index, offset); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. 
- * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; + shrinker = idr_find(&shrinker_idr, shrinker_id); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(offset, unit->map); + continue; + } - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(offset, unit->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, shrinker_id); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + goto unlock; + } } } unlock: -- cgit From 48a7a0996a0089f4161ad437a810a10596e8d278 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:40 +0800 Subject: mm: shrinker: rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}() With the new shrinker APIs, there is no action such as prealloc, so rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}(), which corresponds to the idr_{alloc|remove}() inside the function. Link: https://lkml.kernel.org/r/20230911094444.68966-42-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index e9644cda80b5..e2e832b15803 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -210,7 +210,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) static DEFINE_IDR(shrinker_idr); -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; @@ -236,7 +236,7 @@ unlock: return ret; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; @@ -298,12 +298,12 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) up_read(&shrinker_rwsem); } #else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { } @@ -629,7 +629,7 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); + err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; @@ -701,7 +701,7 @@ void shrinker_free(struct shrinker *shrinker) shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + shrinker_memcg_remove(shrinker); up_write(&shrinker_rwsem); if (debugfs_entry) -- cgit From ca1d36b823944f24b5755311e95883fb5fdb807b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:41 +0800 Subject: mm: shrinker: make global slab shrink lockless The shrinker_rwsem is a global read-write lock in shrinkers subsystem, which protects most operations such as slab shrink, registration and unregistration of shrinkers, etc. This can easily cause problems in the following cases. 1) When the memory pressure is high and there are many filesystems mounted or unmounted at the same time, slab shrink will be affected (down_read_trylock() failed). Such as the real workload mentioned by Kirill Tkhai: ``` One of the real workloads from my experience is start of an overcommitted node containing many starting containers after node crash (or many resuming containers after reboot for kernel update). In these cases memory pressure is huge, and the node goes round in long reclaim. ``` 2) If a shrinker is blocked (such as the case mentioned in [1]) and a writer comes in (such as mount a fs), then this writer will be blocked and cause all subsequent shrinker-related operations to be blocked. Even if there is no competitor when shrinking slab, there may still be a problem. The down_read_trylock() may become a perf hotspot with frequent calls to shrink_slab(). Because of the poor multicore scalability of atomic operations, this can lead to a significant drop in IPC (instructions per cycle). 
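For reference, the locking pattern that produces this hotspot is the one this commit deletes. Here is a simplified sketch of the pre-patch global shrink path, reconstructed from the removed lines in the diff below (the _old suffix is ours, for illustration only):

```
static unsigned long shrink_slab_old(gfp_t gfp_mask, int nid,
				     struct mem_cgroup *memcg, int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/* Global rwsem: its atomic count bounces between CPUs. */
	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;

		/* A waiting writer (e.g. a mount) stalls all readers. */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
	return freed;
}
```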
We used to implement the lockless slab shrink with SRCU [2], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4]. This commit uses the refcount+RCU method [5] proposed by Dave Chinner to re-implement the lockless global slab shrink. The memcg slab shrink is handled in the subsequent patch. For now, all shrinker instances are converted to dynamically allocated and will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. And the shrinker instance will not be run again after unregistration. So the structure that records the pointer of shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to be blocked in unregister_shrinker(). The following are the test results: stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 & 1) Before applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 473062 60.00 8.00 279.13 7884.12 1647.59 for a 60.01s run time: 1440.34s available CPU time 7.99s user time ( 0.55%) 279.13s system time ( 19.38%) 287.12s total time ( 19.93%) load average: 7.12 2.99 1.15 successful run completed in 60.01s (1 min, 0.01 secs) 2) After applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 477165 60.00 8.13 281.34 7952.55 1648.40 for a 60.01s run time: 1440.33s available CPU time 8.12s user time ( 0.56%) 281.34s system time ( 19.53%) 289.46s total time ( 20.10%) load average: 6.98 3.03 1.19 successful run completed in 60.01s (1 min, 0.01 secs) We can see that the ops/s has hardly changed. [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/ [2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. 
McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 89 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index e2e832b15803..9bb2e61092b3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "internal.h" @@ -577,33 +578,50 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { + /* + * lockless algorithm of global shrink. + * + * In the unregistration step, the shrinker will be freed asynchronously + * via RCU after its refcount reaches 0. So both rcu_read_lock() and + * shrinker_try_get() can be used to ensure the existence of the shrinker. + * + * So in the global shrink: + * step 1: use rcu_read_lock() to guarantee existence of the shrinker + * and the validity of the shrinker_list walk. + * step 2: use shrinker_try_get() to try to get the refcount, if successful, + * then the existence of the shrinker can also be guaranteed, + * so we can release the RCU lock to do do_shrink_slab() that + * may sleep. + * step 3: *MUST* reacquire the RCU lock before calling shrinker_put(), + * which ensures that neither this shrinker nor the next shrinker + * will be freed in the next traversal operation. + * step 4: do shrinker_put() paired with step 2 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + */ + rcu_read_lock(); + list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; + if (!shrinker_try_get(shrinker)) + continue; + + rcu_read_unlock(); + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } + + rcu_read_lock(); + shrinker_put(shrinker); } - up_read(&shrinker_rwsem); -out: + rcu_read_unlock(); cond_resched(); return freed; } @@ -676,13 +694,29 @@ void shrinker_register(struct shrinker *shrinker) } down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); + list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); + + init_completion(&shrinker->done); + /* + * Now the shrinker is fully set up, take the first reference to it to + * indicate that lookup operations are now allowed to use it via + * shrinker_try_get().
+ */ + refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); +static void shrinker_free_rcu_cb(struct rcu_head *head) +{ + struct shrinker *shrinker = container_of(head, struct shrinker, rcu); + + kfree(shrinker->nr_deferred); + kfree(shrinker); +} + void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; @@ -691,9 +725,25 @@ void shrinker_free(struct shrinker *shrinker) if (!shrinker) return; + if (shrinker->flags & SHRINKER_REGISTERED) { + /* drop the initial refcount */ + shrinker_put(shrinker); + /* + * Wait for all lookups of the shrinker to complete, after that, + * no shrinker is running or will run again, then we can safely + * free it asynchronously via RCU and safely free the structure + * where the shrinker is located, such as super_block etc. + */ + wait_for_completion(&shrinker->done); + } + down_write(&shrinker_rwsem); if (shrinker->flags & SHRINKER_REGISTERED) { - list_del(&shrinker->list); + /* + * Now we can safely remove it from the shrinker_list and then + * free it. + */ + list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } @@ -707,9 +757,6 @@ void shrinker_free(struct shrinker *shrinker) if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - - kfree(shrinker); + call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); -- cgit From 50d09da8e1199092a128eebd8a986d1fb0335fe4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:42 +0800 Subject: mm: shrinker: make memcg slab shrink lockless Like global slab shrink, this commit also uses refcount+RCU method to make memcg slab shrink lockless. Use the following script to do slab shrink stress test: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 33.15% [kernel] [k] down_read_trylock 25.38% [kernel] [k] shrink_slab 21.75% [kernel] [k] up_read 4.45% [kernel] [k] _find_next_bit 2.27% [kernel] [k] do_shrink_slab 1.80% [kernel] [k] intel_idle_irq 1.79% [kernel] [k] shrink_lruvec 0.67% [kernel] [k] xas_descend 0.41% [kernel] [k] mem_cgroup_iter 0.40% [kernel] [k] shrink_node 0.38% [kernel] [k] list_lru_count_one 2) After applying this patchset: 64.56% [kernel] [k] shrink_slab 12.18% [kernel] [k] do_shrink_slab 3.30% [kernel] [k] __rcu_read_unlock 2.61% [kernel] [k] shrink_lruvec 2.49% [kernel] [k] __rcu_read_lock 1.93% [kernel] [k] intel_idle_irq 0.89% [kernel] [k] shrink_node 0.81% [kernel] [k] mem_cgroup_iter 0.77% [kernel] [k] mem_cgroup_calculate_protection 0.66% [kernel] [k] list_lru_count_one We can see that the first perf hotspot becomes shrink_slab, which is what we expect. 
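Condensed to its core, the lockless walk that replaces the rwsem in both the global and the memcg paths has the following shape (a fragment sketched from the hunks in this series, assuming the shrinker_try_get()/shrinker_put() refcount helpers the series introduces; SHRINK_EMPTY handling omitted for brevity):

```
	rcu_read_lock();
	list_for_each_entry_rcu(shrinker, &shrinker_list, list) {
		if (!shrinker_try_get(shrinker))
			continue;		/* being unregistered, skip */
		rcu_read_unlock();		/* do_shrink_slab() may sleep */

		freed += do_shrink_slab(&sc, shrinker, priority);

		rcu_read_lock();		/* must precede the put: keeps
						 * this node and ->next alive */
		shrinker_put(shrinker);		/* final put completes
						 * shrinker->done for free() */
	}
	rcu_read_unlock();
```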
Link: https://lkml.kernel.org/r/20230911094444.68966-44-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index 9bb2e61092b3..d1b34a79c7a7 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -219,7 +219,6 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -253,10 +252,15 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + rcu_read_unlock(); + + return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, @@ -264,10 +268,16 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + nr_deferred = + atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + rcu_read_unlock(); + + return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) @@ -464,18 +474,54 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!mem_cgroup_online(memcg)) return 0; - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = 
shrinker_info_protected(memcg, nid); + /* + * lockless algorithm of memcg shrink. + * + * The shrinker_info may be freed asynchronously via RCU in the + * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used + * to ensure the existence of the shrinker_info. + * + * The shrinker_info_unit is never freed unless its corresponding memcg + * is destroyed. Here we already hold the refcount of memcg, so the + * memcg will not be destroyed, and of course shrinker_info_unit will + * not be freed. + * + * So in the memcg shrink: + * step 1: use rcu_read_lock() to guarantee existence of the + * shrinker_info. + * step 2: after getting shrinker_info_unit we can safely release the + * RCU lock. + * step 3: traverse the bitmap and calculate shrinker_id + * step 4: use rcu_read_lock() to guarantee existence of the shrinker. + * step 5: use shrinker_id to find the shrinker, then use + * shrinker_try_get() to guarantee existence of the shrinker, + * then we can release the RCU lock to do do_shrink_slab() that + * may sleep. + * step 6: do shrinker_put() paired with step 5 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + * Note: here is different from the global shrink, we don't + * need to acquire the RCU lock to guarantee existence of + * the shrinker, because we don't need to use this + * shrinker to traverse the next shrinker in the bitmap. + * step 7: we have already exited the read-side of rcu critical section + * before calling do_shrink_slab(), the shrinker_info may be + * released in expand_one_shrinker_info(), so go back to step 1 + * to reacquire the shrinker_info. + */ +again: + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; - for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; + rcu_read_unlock(); + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, @@ -485,12 +531,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); + rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(offset, unit->map); + if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { + clear_bit(offset, unit->map); + rcu_read_unlock(); continue; } + rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && @@ -523,15 +571,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - goto unlock; - } + shrinker_put(shrinker); } + + index++; + goto again; } unlock: - up_read(&shrinker_rwsem); + rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ -- cgit From 604b8b655020816b66ae474681d1100c5c0ba8c4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:43 +0800 Subject: mm: shrinker: hold write lock to reparent shrinker nr_deferred For now, reparent_shrinker_deferred() is the only holder of read lock of shrinker_rwsem. And it already holds the global cgroup_mutex, so it will not be called in parallel. 
Therefore, in order to convert shrinker_rwsem to shrinker_mutex later, here we change to hold the write lock of shrinker_rwsem to reparent. Link: https://lkml.kernel.org/r/20230911094444.68966-45-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index d1b34a79c7a7..ef22f20a97cb 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_read(&shrinker_rwsem); + up_write(&shrinker_rwsem); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) -- cgit From 8a0e8bb112af28362514624fc46e20426b4bdf1f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:44 +0800 Subject: mm: shrinker: convert shrinker_rwsem to mutex Now there are no readers of shrinker_rwsem, so we can simply replace it with mutex lock. [akpm@linux-foundation.org: update the fix to alloc_shrinker_info()] Link: https://lkml.kernel.org/r/20230911094444.68966-46-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. 
Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 30 +++++++++++++++--------------- mm/shrinker_debug.c | 14 +++++++------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/shrinker.c b/mm/shrinker.c index ef22f20a97cb..dd91eab43ed3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -8,7 +8,7 @@ #include "internal.h" LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); +DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -80,7 +80,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, ret = 0; int array_size = 0; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); @@ -91,12 +91,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; err: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } @@ -105,7 +105,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); + lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, @@ -155,7 +155,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); @@ -218,7 +218,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -232,7 +232,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } @@ -242,7 +242,7 @@ static void shrinker_memcg_remove(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = 
shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) @@ -740,11 +740,11 @@ void shrinker_register(struct shrinker *shrinker) return; } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* @@ -784,7 +784,7 @@ void shrinker_free(struct shrinker *shrinker) wait_for_completion(&shrinker->done); } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then @@ -799,7 +799,7 @@ void shrinker_free(struct shrinker *shrinker) if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 24aebe7c24cc..12ea5486a3e9 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -9,7 +9,7 @@ #include "internal.h" /* defined in vmscan.c */ -extern struct rw_semaphore shrinker_rwsem; +extern struct mutex shrinker_mutex; extern struct list_head shrinker_list; static DEFINE_IDA(shrinker_debugfs_ida); @@ -165,7 +165,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -208,7 +208,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); old = shrinker->name; shrinker->name = new; @@ -226,7 +226,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); kfree_const(old); @@ -239,7 +239,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; @@ -265,14 +265,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } -- cgit From ed547ab6f4041c2186df672029eb17f98f44d125 Mon Sep 17 00:00:00 2001 From: liujinlong Date: Tue, 12 Sep 2023 16:59:23 +0800 Subject: mm: vmscan: modify an easily misunderstood function name When looking at the memory management code, I found that the purpose of the function prepare_scan_count is very different from the function name.
It is easy to misunderstand when reading. The function prepare_scan_count mainly completes the assignment of the scan_control structure. Therefore, I suggest that the function name be changed to prepare_scan_control, which is easier to understand. Link: https://lkml.kernel.org/r/20230912085923.27238-1-liujinlong@kylinos.cn Signed-off-by: liujinlong Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4a6551510b65..8a3f83e0231e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2208,7 +2208,7 @@ enum scan_balance { SCAN_FILE, }; -static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; struct lruvec *target_lruvec; @@ -5834,7 +5834,7 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - prepare_scan_count(pgdat, sc); + prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); -- cgit From 811244a501b967b00fecb1ae906d5dc6329c91e0 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 14 Sep 2023 00:49:37 +0800 Subject: mm: memcg: add THP swap out info for anonymous reclaim At present, we support a per-memcg reclaim strategy, but we do not know the number of transparent huge pages being reclaimed. As we know, transparent huge pages need to be split before being reclaimed, which can become a performance bottleneck. For example, when two memcgs (A & B) are reclaiming anonymous pages at the same time and the 'A' memcg is reclaiming a large number of transparent huge pages, we can better determine that the performance bottleneck is caused by the 'A' memcg. Therefore, in order to better analyze such problems, add THP swap out info per-memcg.
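For orientation before the diff: the two new counters end up being bumped in exactly two places, which can be condensed to the following shape (taken from the page_io.c and vmscan.c hunks below; count_memcg_folio_events() is the existing per-memcg event helper):

```
/* mm/page_io.c swap-out path: a PMD-sized folio written out intact. */
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(folio_test_pmd_mappable(folio))) {
		count_memcg_folio_events(folio, THP_SWPOUT, 1);
		count_vm_event(THP_SWPOUT);
	}
#endif
	count_vm_events(PSWPOUT, folio_nr_pages(folio));
}

/* mm/vmscan.c reclaim path: a THP that had to be split before swap-out. */
	count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1);
	count_vm_event(THP_SWPOUT_FALLBACK);
```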
[akpm@linux-foundation.org: fix swap_writepage_fs(), per Johannes] Link: https://lkml.kernel.org/r/20230913213343.GB48476@cmpxchg.org Link: https://lkml.kernel.org/r/20230913164938.16918-1-vernhao@tencent.com Signed-off-by: Xin Hao Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 ++ mm/page_io.c | 8 ++++---- mm/vmscan.c | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b009b233ab8..68313331971c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -704,6 +704,8 @@ static const unsigned int memcg_vm_event_stat[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_COLLAPSE_ALLOC, + THP_SWPOUT, + THP_SWPOUT_FALLBACK, #endif }; diff --git a/mm/page_io.c b/mm/page_io.c index fe4c21af23f2..cb559ae324c6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -208,8 +208,10 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (unlikely(folio_test_pmd_mappable(folio))) + if (unlikely(folio_test_pmd_mappable(folio))) { + count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); + } #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } @@ -278,9 +280,6 @@ static void sio_write_complete(struct kiocb *iocb, long ret) set_page_dirty(page); ClearPageReclaim(page); } - } else { - for (p = 0; p < sio->pages; p++) - count_swpout_vm_event(page_folio(sio->bvec[p].bv_page)); } for (p = 0; p < sio->pages; p++) @@ -296,6 +295,7 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) struct file *swap_file = sis->swap_file; loff_t pos = page_file_offset(page); + count_swpout_vm_event(page_folio(page)); set_page_writeback(page); unlock_page(page); if (wbc->swap_plug) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8a3f83e0231e..acf115468bf8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1214,6 +1214,7 @@ retry: folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) -- cgit From fd63908706f79c963946a77b7f352db5431deed5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:08 +0200 Subject: mm/rmap: drop stale comment in page_add_anon_rmap and hugepage_add_anon_rmap() Patch series "Anon rmap cleanups". Some cleanups around rmap for anon pages. I'm working on more cleanups also around file rmap -- also to handle the "compound" parameter internally only and to let hugetlb use page_add_file_rmap(), but these changes make sense separately. This patch (of 6): That comment was added in commit 5dbe0af47f8a ("mm: fix kernel BUG at mm/rmap.c:1017!") to document why we can see vma->vm_end getting adjusted concurrently due to a VMA split. However, the optimized locking code was changed again in bf181b9f9d8 ("mm anon rmap: replace same_anon_vma linked list with an interval tree."). ... and later, the comment was changed in commit 0503ea8f5ba7 ("mm/mmap: remove __vma_adjust()") to talk about "vma_merge" although the original issue was with VMA splitting. Let's just remove that comment. Nowadays, it's outdated, imprecise and confusing.
Link: https://lkml.kernel.org/r/20230913125113.313322-1-david@redhat.com Link: https://lkml.kernel.org/r/20230913125113.313322-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/rmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 9f795b93cf40..61ad50b9ed9f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1245,7 +1245,6 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (likely(!folio_test_ksm(folio))) { - /* address might be in next vma when migration races vma_merge */ if (first) __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); @@ -2549,7 +2548,6 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_merge */ first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); -- cgit From c66db8c0702c0ab741ecfd5e12b323ff49fe9089 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:09 +0200 Subject: mm/rmap: move SetPageAnonExclusive out of __page_set_anon_rmap() Let's handle it in the caller. No need to pass the page. While at it, rename the function to __folio_set_anon() and pass "bool exclusive" instead of "int exclusive". Link: https://lkml.kernel.org/r/20230913125113.313322-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 61ad50b9ed9f..f76f0f23725c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1122,27 +1122,25 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) } /** - * __page_set_anon_rmap - set up new anonymous rmap - * @folio: Folio which contains page. - * @page: Page to add to rmap. - * @vma: VM area to add page to. + * __folio_set_anon - set up a new anonymous rmap for a folio + * @folio: The folio to set up the new anonymous rmap for. + * @vma: VM area to add the folio to. * @address: User virtual address of the mapping - * @exclusive: the page is exclusively owned by the current process + * @exclusive: Whether the folio is exclusive to the process. */ -static void __page_set_anon_rmap(struct folio *folio, struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) +static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); if (folio_test_anon(folio)) - goto out; + return; /* - * If the page isn't exclusively mapped into this vma, - * we must use the _oldest_ possible anon_vma for the - * page mapping! + * If the folio isn't exclusive to this vma, we must use the _oldest_ + * possible anon_vma for the folio mapping! 
*/ if (!exclusive) anon_vma = anon_vma->root; @@ -1156,9 +1154,6 @@ static void __page_set_anon_rmap(struct folio *folio, struct page *page, anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); -out: - if (exclusive) - SetPageAnonExclusive(page); } /** @@ -1246,11 +1241,13 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (likely(!folio_test_ksm(folio))) { if (first) - __page_set_anon_rmap(folio, page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(folio, page, vma, address); } + if (flags & RMAP_EXCLUSIVE) + SetPageAnonExclusive(page); mlock_vma_folio(folio, vma, compound); } @@ -1289,7 +1286,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(folio, &folio->page, vma, address, 1); + __folio_set_anon(folio, vma, address, true); + SetPageAnonExclusive(&folio->page); } /** @@ -2552,8 +2550,10 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) - __page_set_anon_rmap(folio, page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + if (flags & RMAP_EXCLUSIVE) + SetPageAnonExclusive(page); } void hugepage_add_new_anon_rmap(struct folio *folio, @@ -2563,6 +2563,7 @@ void hugepage_add_new_anon_rmap(struct folio *folio, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, &folio->page, vma, address, 1); + __folio_set_anon(folio, vma, address, true); + SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit From c5c540034747dfe450f64d1151081a6080daa8f9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:10 +0200 Subject: mm/rmap: move folio_test_anon() check out of __folio_set_anon() Let's handle it in the caller; no need for the "first" check based on the mapcount. We really only end up with !anon pages in page_add_anon_rmap() via do_swap_page(), where we hold the folio lock. So races are not possible. Add a VM_WARN_ON_FOLIO() to make sure that we really hold the folio lock. In the future, we might want to let do_swap_page() use folio_add_new_anon_rmap() on new pages instead: however, we might have to pass then whether the folio is exclusive or not. So keep it in there for now. For hugetlb we never expect to have a non-anon page in hugepage_add_anon_rmap(). Remove that code, along with some other checks that are either not required or were checked in hugepage_add_new_anon_rmap() already. 
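The caller-side logic in page_add_anon_rmap() then boils down to the following (condensed from the hunk below, with an extra comment on why holding the folio lock is enough):

```
	if (unlikely(!folio_test_anon(folio))) {
		/*
		 * Only do_swap_page() reaches here with a !anon folio, and
		 * it holds the folio lock, so no rmap setup can race.
		 */
		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
		__folio_set_anon(folio, vma, address,
				 !!(flags & RMAP_EXCLUSIVE));
	} else if (likely(!folio_test_ksm(folio))) {
		__page_check_anon_rmap(folio, page, vma, address);
	}
	if (flags & RMAP_EXCLUSIVE)
		SetPageAnonExclusive(page);
```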
Link: https://lkml.kernel.org/r/20230913125113.313322-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index f76f0f23725c..65de3832123e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1135,9 +1135,6 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, BUG_ON(!anon_vma); - if (folio_test_anon(folio)) - return; - /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! @@ -1239,12 +1236,12 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (nr) __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - if (likely(!folio_test_ksm(folio))) { - if (first) - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - else - __page_check_anon_rmap(folio, page, vma, address); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + } else if (likely(!folio_test_ksm(folio))) { + __page_check_anon_rmap(folio, page, vma, address); } if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); @@ -2541,17 +2538,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); - struct anon_vma *anon_vma = vma->anon_vma; int first; - BUG_ON(!folio_test_locked(folio)); - BUG_ON(!anon_vma); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); } -- cgit From a1f34ee1de2c3a55bc2a6b9a38e1ecd2830dcc03 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:11 +0200 Subject: mm/rmap: warn on new PTE-mapped folios in page_add_anon_rmap() If swapin code would ever decide to not use order-0 pages and supply a PTE-mapped large folio, we will have to change how we call __folio_set_anon() -- eventually with exclusive=false and an adjusted address. For now, let's add a VM_WARN_ON_FOLIO() with a comment about the situation. Link: https://lkml.kernel.org/r/20230913125113.313322-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 65de3832123e..9b40c3feba3e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1238,6 +1238,13 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + /* + * For a PTE-mapped large folio, we only know that the single + * PTE is exclusive. Further, __folio_set_anon() might not get + * folio->index right when not given the address of the head + * page. 
+ */ + VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio); __folio_set_anon(folio, vma, address, !!(flags & RMAP_EXCLUSIVE)); } else if (likely(!folio_test_ksm(folio))) { -- cgit From 132b180f06a74ddfc526709928036db3b7a1cf6d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:12 +0200 Subject: mm/rmap: simplify PageAnonExclusive sanity checks when adding anon rmap Let's sanity-check PageAnonExclusive vs. mapcount in page_add_anon_rmap() and hugepage_add_anon_rmap() after setting PageAnonExclusive simply by re-reading the mapcounts. We can stop initializing the "first" variable in page_add_anon_rmap() and no longer need an atomic_inc_and_test() in hugepage_add_anon_rmap(). While at it, switch to VM_WARN_ON_FOLIO(). [david@redhat.com: update check for doubly-mapped page] Link: https://lkml.kernel.org/r/d8e5a093-2e22-c14b-7e64-6da280398d9f@redhat.com Link: https://lkml.kernel.org/r/20230913125113.313322-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 9b40c3feba3e..ed4b602bcbd5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1199,7 +1199,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first = true; + bool first; /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { @@ -1228,9 +1228,6 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } } - VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); - VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (nr_pmdmapped) __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) @@ -1252,6 +1249,10 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); + /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ + VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 || + (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && + PageAnonExclusive(page), folio); mlock_vma_folio(folio, vma, compound); } @@ -2545,15 +2546,14 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); - int first; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - first = atomic_inc_and_test(&folio->_entire_mapcount); - VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); - VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); + atomic_inc(&folio->_entire_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); + VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && + PageAnonExclusive(page), folio); } void hugepage_add_new_anon_rmap(struct folio *folio, -- cgit From 09c550508a4b8f7844b197cc16877dd0f7c42d8f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:13 +0200 Subject: mm/rmap: pass folio to hugepage_add_anon_rmap() Let's pass a folio; we are always mapping the entire thing. 
Link: https://lkml.kernel.org/r/20230913125113.313322-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- mm/rmap.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2053b54556ca..eb6bc4053bc4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -249,7 +249,7 @@ static bool remove_migration_pte(struct folio *folio, pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) - hugepage_add_anon_rmap(new, vma, pvmw.address, + hugepage_add_anon_rmap(folio, vma, pvmw.address, rmap_flags); else page_dup_file_rmap(new, true); diff --git a/mm/rmap.c b/mm/rmap.c index ed4b602bcbd5..d24e2c36372e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2542,18 +2542,16 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) * * RMAP_COMPOUND is ignored. */ -void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, +void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - struct folio *folio = page_folio(page); - VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); if (flags & RMAP_EXCLUSIVE) - SetPageAnonExclusive(page); + SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && - PageAnonExclusive(page), folio); + PageAnonExclusive(&folio->page), folio); } void hugepage_add_new_anon_rmap(struct folio *folio, -- cgit From a8ac4a767dcd9d87d8229045904d9fe15ea5e0e8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:24 +0800 Subject: mm: migrate: remove PageTransHuge check in numamigrate_isolate_page() Patch series "mm: migrate: more folio conversion and unification", v3. Convert more migrate functions to use a folio; this is also preparation for large folio migration support when balancing numa. This patch (of 8): The assert VM_BUG_ON_PAGE(order && !PageTransHuge(page), page) is not very useful: 1) for a tail/base page, order = 0, and for a head page, order > 0 && PageTransHuge() is true; 2) there is a PageCompound() check and only base pages are handled in do_numa_page(), while do_huge_pmd_numa_page() only handles PMD-mapped THPs; 3) even if the page is a tail page, isolate_lru_page() will post a warning and fail to isolate it; 4) if large folio/pte-mapped THP migration is supported in the future, we could migrate the entire folio on a numa fault on a tail page. So just remove the check.
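In code-comment form, the case analysis above reads as follows (the assertion itself is what the hunk below deletes):

```
	int order = compound_order(page);	/* 0 for base *and* tail pages */

	/*
	 * base page: order == 0, so "order &&" short-circuits
	 * head page: order > 0 and PageTransHuge(page) is true
	 * tail page: order == 0 here too, and isolate_lru_page()
	 *            would warn and fail on it anyway
	 */
	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);	/* removed */
```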
Link: https://lkml.kernel.org/r/20230913095131.2426871-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230913095131.2426871-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Huang Ying Cc: Hugh Dickins Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index eb6bc4053bc4..d1b8ab5a2417 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2486,8 +2486,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int nr_pages = thp_nr_pages(page); int order = compound_order(page); - VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); - /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; -- cgit From 728be28fae8c838d52c91dce4867133798146357 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:25 +0800 Subject: mm: migrate: remove THP mapcount check in numamigrate_isolate_page() The check of THP mapped by multiple processes was introduced by commit 04fa5d6a6547 ("mm: migrate: check page_count of THP before migrating") and refactored by commit 340ef3902cf2 ("mm: numa: cleanup flow of transhuge page migration"). It is now out of date: migrate_misplaced_page() uses the standard migrate_pages() for both small pages and THPs, and the reference count check is done in folio_migrate_mapping(), so let's remove the special check for THP. Link: https://lkml.kernel.org/r/20230913095131.2426871-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index d1b8ab5a2417..0030703204d3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2486,10 +2486,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int nr_pages = thp_nr_pages(page); int order = compound_order(page); - /* Do not migrate THP mapped by multiple processes */ - if (PageTransHuge(page) && total_mapcount(page) > 1) - return 0; - /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { int z; -- cgit From 2ac9e99f3b21b2864305fbfba4bae5913274c409 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:26 +0800 Subject: mm: migrate: convert numamigrate_isolate_page() to numamigrate_isolate_folio() Rename numamigrate_isolate_page() to numamigrate_isolate_folio(), then make it take a folio and use the folio API to save compound_head() calls.
Link: https://lkml.kernel.org/r/20230913095131.2426871-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 0030703204d3..1f1aebe8da18 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2481,10 +2481,9 @@ static struct folio *alloc_misplaced_dst_folio(struct folio *src, return __folio_alloc_node(gfp, order, nid); } -static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) { - int nr_pages = thp_nr_pages(page); - int order = compound_order(page); + int nr_pages = folio_nr_pages(folio); /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { @@ -2496,22 +2495,23 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (managed_zone(pgdat->node_zones + z)) break; } - wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); + wakeup_kswapd(pgdat->node_zones + z, 0, + folio_order(folio), ZONE_MOVABLE); return 0; } - if (!isolate_lru_page(page)) + if (!folio_isolate_lru(folio)) return 0; - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), + node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), nr_pages); /* - * Isolating the page has taken another reference, so the - * caller's reference can be safely dropped without the page + * Isolating the folio has taken another reference, so the + * caller's reference can be safely dropped without the folio * disappearing underneath us during migration. */ - put_page(page); + folio_put(folio); return 1; } @@ -2545,7 +2545,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_lru(page) && PageDirty(page)) goto out; - isolated = numamigrate_isolate_page(pgdat, page); + isolated = numamigrate_isolate_folio(pgdat, page_folio(page)); if (!isolated) goto out; -- cgit From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:27 +0800 Subject: mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio() At present, numa balance only supports base pages and PMD-mapped THP, but we will expand it to support migration of large folios/pte-mapped THP in the future. It is better to make migrate_misplaced_page() take a folio instead of a page and rename it to migrate_misplaced_folio() as a preparation; this also removes several compound_head() calls.
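Callers that still have only a struct page wrap it once at the call boundary, as the do_huge_pmd_numa_page() and do_numa_page() hunks below do:

	migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid);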
Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/migrate.c | 39 +++++++++++++++++++++------------------ 3 files changed, 23 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cfd83e91748..5c20e43782e4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1567,7 +1567,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - migrated = migrate_misplaced_page(page, vma, target_nid); + migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; diff --git a/mm/memory.c b/mm/memory.c index 0739ccb00e61..d956b231e835 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4812,7 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_page(page, vma, target_nid)) { + if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 1f1aebe8da18..1b848f6b5fbc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2516,55 +2516,58 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) } /* - * Attempt to migrate a misplaced page to the specified destination + * Attempt to migrate a misplaced folio to the specified destination * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. + * the folio that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, + int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); - int nr_pages = thp_nr_pages(page); + int nr_pages = folio_nr_pages(folio); /* - * Don't migrate file pages that are mapped in multiple processes + * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if (page_mapcount(page) != 1 && page_is_file_lru(page) && + if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; /* - * Also do not migrate dirty pages as not all filesystems can move - * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + * Also do not migrate dirty folios as not all filesystems can move + * dirty folios in MIGRATE_ASYNC mode which is a waste of cycles. 
*/ - if (page_is_file_lru(page) && PageDirty(page)) + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) goto out; - isolated = numamigrate_isolate_folio(pgdat, page_folio(page)); + isolated = numamigrate_isolate_folio(pgdat, folio); if (!isolated) goto out; - list_add(&page->lru, &migratepages); + list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { - list_del(&page->lru); - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -nr_pages); - putback_lru_page(page); + list_del(&folio->lru); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -nr_pages); + folio_putback_lru(folio); } isolated = 0; } if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); - if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) + if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, nr_succeeded); } @@ -2572,7 +2575,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, return isolated; out: - put_page(page); + folio_put(folio); return 0; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit From 7e2a5e5ab217d5e4166cdbdf4af8c5e34b6200bb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:28 +0800 Subject: mm: migrate: use __folio_test_movable() Use __folio_test_movable(), no need to convert from folio to page again. Link: https://lkml.kernel.org/r/20230913095131.2426871-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Zi Yan Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1b848f6b5fbc..2a826da603bf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -157,8 +157,8 @@ void putback_movable_pages(struct list_head *l) list_del(&folio->lru); /* * We isolated non-lru movable folio so here we can use - * __PageMovable because LRU folio's mapping cannot have - * PAGE_MAPPING_MOVABLE. + * __folio_test_movable because LRU folio's mapping cannot + * have PAGE_MAPPING_MOVABLE. */ if (unlikely(__folio_test_movable(folio))) { VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); @@ -946,7 +946,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc = -EAGAIN; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); @@ -993,7 +993,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, * src is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (__PageMovable(&src->page)) { + if (__folio_test_movable(src)) { VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); /* @@ -1085,7 +1085,7 @@ static void migrate_folio_done(struct folio *src, /* * Compaction can migrate also non-LRU pages which are * not accounted to NR_ISOLATED_*. 
They can be recognized - * as __PageMovable + * as __folio_test_movable */ if (likely(!__folio_test_movable(src))) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + @@ -1106,7 +1106,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); bool locked = false; bool dst_locked = false; @@ -1264,7 +1264,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); struct list_head *prev; __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); -- cgit From d64cfccbc805663a2c5691f638cf9198b9676a9f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:29 +0800 Subject: mm: migrate: use a folio in add_page_for_migration() Use a folio in add_page_for_migration() to save compound_head() calls. Link: https://lkml.kernel.org/r/20230913095131.2426871-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2a826da603bf..ce01d03e635a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2060,6 +2060,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, struct vm_area_struct *vma; unsigned long addr; struct page *page; + struct folio *folio; int err; bool isolated; @@ -2082,45 +2083,42 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, if (!page) goto out; - if (is_zone_device_page(page)) - goto out_putpage; + folio = page_folio(page); + if (folio_is_zone_device(folio)) + goto out_putfolio; err = 0; - if (page_to_nid(page) == node) - goto out_putpage; + if (folio_nid(folio) == node) + goto out_putfolio; err = -EACCES; if (page_mapcount(page) > 1 && !migrate_all) - goto out_putpage; + goto out_putfolio; - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { if (PageHead(page)) { - isolated = isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(folio, pagelist); err = isolated ? 1 : -EBUSY; } } else { - struct page *head; - - head = compound_head(page); - isolated = isolate_lru_page(head); + isolated = folio_isolate_lru(folio); if (!isolated) { err = -EBUSY; - goto out_putpage; + goto out_putfolio; } err = 1; - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + list_add_tail(&folio->lru, pagelist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } -out_putpage: +out_putfolio: /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. + * Either remove the duplicate refcount from folio_isolate_lru() + * or drop the folio ref if it was not isolated. 
*/ - put_page(page); + folio_put(folio); out: mmap_read_unlock(mm); return err; -- cgit From b426ed7889be80359cb4edef142e5c5fa697b068 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:30 +0800 Subject: mm: migrate: remove PageHead() check for HugeTLB in add_page_for_migration() HugeTLB and THP behave differently when passed the address of a tail page: THP will migrate the entire THP page, but HugeTLB will return -EACCES, or -ENOENT before commit e66f17ff7177 ("mm/hugetlb: take page table lock in follow_huge_pmd()"): -EACCES The page is mapped by multiple processes and can be moved only if MPOL_MF_MOVE_ALL is specified. -ENOENT The page is not present. But checking the manual[1], neither of the two errnos is suitable. It is better to keep the behavior the same between HugeTLB and THP when passed the address of a tail page, so let's just remove the PageHead() check for HugeTLB. [1] https://man7.org/linux/man-pages/man2/move_pages.2.html Link: https://lkml.kernel.org/r/20230913095131.2426871-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Mike Kravetz Acked-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index ce01d03e635a..8495e7e02dd4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2096,10 +2096,8 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, goto out_putfolio; if (folio_test_hugetlb(folio)) { - if (PageHead(page)) { - isolated = isolate_hugetlb(folio, pagelist); - err = isolated ? 1 : -EBUSY; - } + isolated = isolate_hugetlb(folio, pagelist); + err = isolated ? 1 : -EBUSY; } else { isolated = folio_isolate_lru(folio); if (!isolated) { err = -EBUSY; goto out_putfolio; } err = 1; list_add_tail(&folio->lru, pagelist); -- cgit From fa1df3f6287e1e1fd8b5309828238e2c728e985f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:31 +0800 Subject: mm: migrate: remove isolated variable in add_page_for_migration() Directly check the return of isolate_hugetlb() and folio_isolate_lru() to remove the isolated variable, also set err = -EBUSY in advance before isolation, and update err only when the folio is successfully queued for migration, which helps to unify and simplify the code a bit. Link: https://lkml.kernel.org/r/20230913095131.2426871-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 8495e7e02dd4..7d1804c4a5d9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2062,7 +2062,6 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, struct page *page; struct folio *folio; int err; - bool isolated; mmap_read_lock(mm); addr = (unsigned long)untagged_addr_remote(mm, p); @@ -2095,15 +2094,13 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, if (page_mapcount(page) > 1 && !migrate_all) goto out_putfolio; + err = -EBUSY; if (folio_test_hugetlb(folio)) { - isolated = isolate_hugetlb(folio, pagelist); - err = isolated ?
1 : -EBUSY; + if (isolate_hugetlb(folio, pagelist)) + err = 1; } else { - isolated = folio_isolate_lru(folio); - if (!isolated) { - err = -EBUSY; + if (!folio_isolate_lru(folio)) goto out_putfolio; - } err = 1; list_add_tail(&folio->lru, pagelist); -- cgit From c603c630b509d690d470b9b49eb96f40482f47d5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 13 Sep 2023 02:20:49 +0000 Subject: mm/damon/core: add a tracepoint for damos apply target regions Patch series "mm/damon: add a tracepoint for damos apply target regions", v2. DAMON provides damon_aggregated tracepoint to let users record full monitoring results. Sometimes, users need to record monitoring results of specific pattern. DAMOS tried regions directory of DAMON sysfs interface allows it, but the interface is mainly designed for snapshots and therefore would be inefficient for such recording. Implement yet another tracepoint for efficient support of the usecase. This patch (of 2): DAMON provides damon_aggregated tracepoint, which exposes details of each region and its access monitoring results. It is useful for getting whole monitoring results, e.g., for recording purposes. For investigations of DAMOS, DAMON Sysfs interface provides DAMOS statistics and tried_regions directory. But, those provides only statistics and snapshots. If the scheme is frequently applied and if the user needs to know every detail of DAMOS behavior, the snapshot-based interface could be insufficient and expensive. As a last resort, userspace users need to record the all monitoring results via damon_aggregated tracepoint and simulate how DAMOS would worked. It is unnecessarily complicated. DAMON kernel API users, meanwhile, can do that easily via before_damos_apply() callback field of 'struct damon_callback', though. Add a tracepoint that will be called just after before_damos_apply() callback for more convenient investigations of DAMOS. The tracepoint exposes all details about each regions, similar to damon_aggregated tracepoint. Please note that DAMOS is currently not only for memory management but also for query-like efficient monitoring results retrievals (when 'stat' action is used). Until now, only statistics or snapshots were supported. Addition of this tracepoint allows efficient full recording of DAMOS-based filtered monitoring results. Link: https://lkml.kernel.org/r/20230913022050.2109-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230913022050.2109-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Steven Rostedt (Google) [tracing] Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/core.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index ca631dd88b33..3ca34a252a3c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -950,6 +950,33 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct timespec64 begin, end; unsigned long sz_applied = 0; int err = 0; + /* + * We plan to support multiple context per kdamond, as DAMON sysfs + * implies with 'nr_contexts' file. Nevertheless, only single context + * per kdamond is supported for now. So, we can simply use '0' context + * index here. 
+ */ + unsigned int cidx = 0; + struct damos *siter; /* schemes iterator */ + unsigned int sidx = 0; + struct damon_target *titer; /* targets iterator */ + unsigned int tidx = 0; + bool do_trace = false; + + /* get indices for trace_damos_before_apply() */ + if (trace_damos_before_apply_enabled()) { + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + damon_for_each_target(titer, c) { + if (titer == t) + break; + tidx++; + } + do_trace = true; + } if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -964,8 +991,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) err = c->callback.before_damos_apply(c, t, r, s); - if (!err) + if (!err) { + trace_damos_before_apply(cidx, sidx, tidx, r, + damon_nr_regions(t), do_trace); sz_applied = c->ops.apply_scheme(c, t, r, s); + } ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); -- cgit From 3dfbb555c98ac55b9d911f9af0e35014b445fb41 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Sep 2023 15:16:39 +0200 Subject: mm, vmscan: remove ISOLATE_UNMAPPED This isolate_mode_t flag is effectively unused since 89f6c88a6ab4 ("mm: __isolate_lru_page_prepare() in isolate_migratepages_block()") as sc->may_unmap is now checked directly (and only node_reclaim has a mode that sets it to 0). The last remaining place is mm_vmscan_lru_isolate tracepoint for the isolate_mode parameter. That one was mainly used to indicate the active/inactive mode, which the trace-vmscan-postprocess.pl script consumed, but that got silently broken. After fixing the script by the previous patch, it does not need the isolate_mode anymore. So just remove the parameter and with that the whole ISOLATE_UNMAPPED flag. Link: https://lkml.kernel.org/r/20230914131637.12204-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index acf115468bf8..3df0e2a59052 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1689,8 +1689,7 @@ move: } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - total_scan, skipped, nr_taken, - sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); + total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit From 2e7cfe5cd5b6b0b98abf57a3074885979e187c1c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:44 -0400 Subject: mm/cma: use nth_page() in place of direct struct page manipulation Patch series "Use nth_page() in place of direct struct page manipulation", v3. On SPARSEMEM without VMEMMAP, struct page is not guaranteed to be contiguous, since each memory section's memmap might be allocated independently. hugetlb pages can go beyond a memory section size, thus direct struct page manipulation on hugetlb pages/subpages might give wrong struct page. Kernel provides nth_page() to do the manipulation properly. Use that whenever code can see hugetlb pages. This patch (of 5): When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. 
Without the fix, page_kasan_tag_reset() could reset wrong page tags, causing a wrong kasan result. No related bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20230913201248.452081-2-zi.yan@sent.com Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index da2967c6a223..2b2494fd6b59 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(page + i); + page_kasan_tag_reset(nth_page(page, i)); } if (ret && !no_warn) { -- cgit From 426056efe835cf4864ccf4c328fe3af9146fc539 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:45 -0400 Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. A wrong or non-existing page might be tried to be grabbed, either leading to a non freeable page or kernel memory access errors. No bug is reported. It comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-3-zi.yan@sent.com Fixes: 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c90c43574a6..a945efe2858a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6493,7 +6493,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } } - page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); /* * Note that page may be a sub-page, and with vmemmap -- cgit From 1640a0ef80f6d572725f5b0330038c18e98ea168 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:46 -0400 Subject: mm/memory_hotplug: use pfn math in place of direct struct page manipulation When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use pfn calculation to handle it properly. Without the fix, a wrong number of page might be skipped. Since skip cannot be negative, scan_movable_page() will end early and might miss a movable page with -ENOENT. This might fail offline_pages(). No bug is reported. The fix comes from code inspection. 
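To make the difference concrete, here is a sketch of the old and new computations; they disagree only on SPARSEMEM without VMEMMAP:

	/* old: struct page pointer arithmetic; wrong when head and page
	 * live in separately allocated memmaps of different sections */
	skip = compound_nr(head) - (page - head);

	/* new: pfn arithmetic, which is always linear */
	skip = compound_nr(head) - (pfn - page_to_pfn(head));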
Link: https://lkml.kernel.org/r/20230913201248.452081-4-zi.yan@sent.com Fixes: eeb0efd071d8 ("mm,memory_hotplug: fix scan_movable_pages() for gigantic hugepages") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b03f4ec6fd2..3b301c4023ff 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1689,7 +1689,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (HPageMigratable(head)) goto found; - skip = compound_nr(head) - (page - head); + skip = compound_nr(head) - (pfn - page_to_pfn(head)); pfn += skip - 1; } return -ENOENT; -- cgit From 4472edf63d6630e6cf65e205b4fc8c3c94d0afe5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 14 Sep 2023 02:15:23 +0000 Subject: mm/damon/core: use number of passed access sampling as a timer DAMON sleeps for sampling interval after each sampling, and check if the aggregation interval and the ops update interval have passed using ktime_get_coarse_ts64() and baseline timestamps for the intervals. That design is for making the operations occur at deterministic timing regardless of the time that spend for each work. However, it turned out it is not that useful, and incur not-that-intuitive results. After all, timer functions, and especially sleep functions that DAMON uses to wait for specific timing, are not necessarily strictly accurate. It is legal design, so no problem. However, depending on such inaccuracies, the nr_accesses can be larger than aggregation interval divided by sampling interval. For example, with the default setting (5 ms sampling interval and 100 ms aggregation interval) we frequently show regions having nr_accesses larger than 20. Also, if the execution of a DAMOS scheme takes a long time, next aggregation could happen before enough number of samples are collected. This is not what usual users would intuitively expect. Since access check sampling is the smallest unit work of DAMON, using the number of passed sampling intervals as the DAMON-internal timer can easily avoid these problems. That is, convert aggregation and ops update intervals to numbers of sampling intervals that need to be passed before those operations be executed, count the number of passed sampling intervals, and invoke the operations as soon as the specific amount of sampling intervals passed. Make the change. Note that this could make a behavioral change to settings that using intervals that not aligned by the sampling interval. For example, if the sampling interval is 5 ms and the aggregation interval is 12 ms, DAMON effectively uses 15 ms as its aggregation interval, because it checks whether the aggregation interval after sleeping the sampling interval. This change will make DAMON to effectively use 10 ms as aggregation interval, since it uses 'aggregation interval / sampling interval * sampling interval' as the effective aggregation interval, and we don't use floating point types. Usual users would have used aligned intervals, so this behavioral change is not expected to make any meaningful impact, so just make this change. 
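A worked example of the behavioral change described above, with a 5 ms sampling interval and a 12 ms aggregation interval: the aggregation target advances by aggr_interval / sample_interval = 2 sampling intervals at a time, so aggregation now effectively runs every 2 * 5 = 10 ms instead of the old design's effective 15 ms. The core of the counting scheme, simplified from the kdamond_fn() hunk below:

	kdamond_usleep(sample_interval);
	ctx->passed_sample_intervals++;
	if (ctx->passed_sample_intervals == next_aggregation_sis) {
		ctx->next_aggregation_sis = next_aggregation_sis +
			ctx->attrs.aggr_interval / sample_interval;
		/* ... merge regions, flush aggregated results ... */
	}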
Link: https://lkml.kernel.org/r/20230914021523.60649-1-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 96 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 3ca34a252a3c..c5b7296c69a0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -427,8 +427,10 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.aggr_interval = 100 * 1000; ctx->attrs.ops_update_interval = 60 * 1000 * 1000; - ktime_get_coarse_ts64(&ctx->last_aggregation); - ctx->last_ops_update = ctx->last_aggregation; + ctx->passed_sample_intervals = 0; + /* These will be set from kdamond_init_intervals_sis() */ + ctx->next_aggregation_sis = 0; + ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); @@ -552,6 +554,9 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, */ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { + unsigned long sample_interval = attrs->sample_interval ? + attrs->sample_interval : 1; + if (attrs->min_nr_regions < 3) return -EINVAL; if (attrs->min_nr_regions > attrs->max_nr_regions) @@ -559,6 +564,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->sample_interval > attrs->aggr_interval) return -EINVAL; + ctx->next_aggregation_sis = ctx->passed_sample_intervals + + attrs->aggr_interval / sample_interval; + ctx->next_ops_update_sis = ctx->passed_sample_intervals + + attrs->ops_update_interval / sample_interval; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; @@ -732,38 +742,6 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } -/* - * damon_check_reset_time_interval() - Check if a time interval is elapsed. - * @baseline: the time to check whether the interval has elapsed since - * @interval: the time interval (microseconds) - * - * See whether the given time interval has passed since the given baseline - * time. If so, it also updates the baseline to current time for next check. - * - * Return: true if the time interval has passed, or false otherwise. - */ -static bool damon_check_reset_time_interval(struct timespec64 *baseline, - unsigned long interval) -{ - struct timespec64 now; - - ktime_get_coarse_ts64(&now); - if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < - interval * 1000) - return false; - *baseline = now; - return true; -} - -/* - * Check whether it is time to flush the aggregated information - */ -static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) -{ - return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->attrs.aggr_interval); -} - /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1274,18 +1252,6 @@ static void kdamond_split_regions(struct damon_ctx *ctx) last_nr_regions = nr_regions; } -/* - * Check whether it is time to check and apply the operations-related data - * structures. - * - * Returns true if it is. - */ -static bool kdamond_need_update_operations(struct damon_ctx *ctx) -{ - return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->attrs.ops_update_interval); -} - /* * Check whether current monitoring should be stopped * @@ -1397,6 +1363,17 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return -EBUSY; } +static void kdamond_init_intervals_sis(struct damon_ctx *ctx) +{ + unsigned long sample_interval = ctx->attrs.sample_interval ? 
+ ctx->attrs.sample_interval : 1; + + ctx->passed_sample_intervals = 0; + ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; + ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / + sample_interval; +} + /* * The monitoring daemon that runs as a kernel thread */ @@ -1410,6 +1387,8 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + kdamond_init_intervals_sis(ctx); + if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) @@ -1418,6 +1397,17 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx)) { + /* + * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could + * be changed from after_wmarks_check() or after_aggregation() + * callbacks. Read the values here, and use those for this + * iteration. That is, damon_set_attrs() updated new values + * are respected from next iteration. + */ + unsigned long next_aggregation_sis = ctx->next_aggregation_sis; + unsigned long next_ops_update_sis = ctx->next_ops_update_sis; + unsigned long sample_interval = ctx->attrs.sample_interval; + if (kdamond_wait_activation(ctx)) break; @@ -1427,12 +1417,17 @@ static int kdamond_fn(void *data) ctx->callback.after_sampling(ctx)) break; - kdamond_usleep(ctx->attrs.sample_interval); + kdamond_usleep(sample_interval); + ctx->passed_sample_intervals++; if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (kdamond_aggregate_interval_passed(ctx)) { + sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + if (ctx->passed_sample_intervals == next_aggregation_sis) { + ctx->next_aggregation_sis = next_aggregation_sis + + ctx->attrs.aggr_interval / sample_interval; kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); @@ -1447,7 +1442,10 @@ static int kdamond_fn(void *data) ctx->ops.reset_aggregated(ctx); } - if (kdamond_need_update_operations(ctx)) { + if (ctx->passed_sample_intervals == next_ops_update_sis) { + ctx->next_ops_update_sis = next_ops_update_sis + + ctx->attrs.ops_update_interval / + sample_interval; if (ctx->ops.update) ctx->ops.update(ctx); sz_limit = damon_region_sz_limit(ctx); -- cgit From 78fbfb155d204428119310d1b9df665ab88da6e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:44 +0000 Subject: mm/damon/core: define and use a dedicated function for region access rate update Patch series "mm/damon: provide pseudo-moving sum based access rate". DAMON checks the access to each region for every sampling interval, increase the access rate counter of the region, namely nr_accesses, if the access was made. For every aggregation interval, the counter is reset. The counter is exposed to users to be used as a metric showing the relative access rate (frequency) of each region. In other words, DAMON provides access rate of each region in every aggregation interval. The aggregation avoids temporal access pattern changes making things confusing. However, this also makes a few DAMON-related operations to unnecessarily need to be aligned to the aggregation interval. This can restrict the flexibility of DAMON applications, especially when the aggregation interval is huge. To provide the monitoring results in finer-grained timing while keeping handling of temporal access pattern change, this patchset implements a pseudo-moving sum based access rate metric. 
It is a pseudo-moving sum because a strict moving sum implementation would need to keep all values for the last time window, and that could incur high overhead, as there could be an arbitrary number of values in a time window. Especially in the case of nr_accesses, since the sampling interval and aggregation interval can be set arbitrarily and the past values would have to be maintained for every region, it could be risky. The pseudo-moving sum assumes there was no temporal access pattern change in the last discrete time window, to remove the need for keeping the list of the last time window values. As a result, it is not a strict moving sum implementation, but it provides reasonable accuracy. Also, it keeps an important property of the moving sum. That is, the moving sum becomes the same as the discrete-window based sum at times that align to the time window. This means using the pseudo-moving sum based nr_accesses makes no change for users who read the value once per aggregation interval. Patches Sequence ---------------- The sequence of the patches is as follows. The first four patches are for preparation of the change. The first two (patches 1 and 2) implement a helper function for nr_accesses updates and eliminate a corner case that skips use of the function, respectively. The following two (patches 3 and 4) respectively implement the pseudo-moving sum function and its simple unit test case. Two patches that make DAMON use the pseudo-moving sum follow. The fifth one (patch 5) introduces a new field representing the pseudo-moving sum-based access rate of each region, and the sixth one makes the new representation actually be updated with the pseudo-moving sum function. The last two patches (patches 7 and 8) make followup fixes for skipping unnecessary updates and marking the moving sum function as static, respectively. This patch (of 8): Each DAMON operations set updates the nr_accesses field of each damon_region for each of its access check results, from the check_accesses() callback. Directly accessing the field could make things complex to manage and change in the future. Define and use a dedicated function for the purpose. Link: https://lkml.kernel.org/r/20230915025251.72816-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230915025251.72816-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 ++++++++++++++++ mm/damon/paddr.c | 6 ++---- mm/damon/vaddr.c | 6 ++---- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index c5b7296c69a0..10532159323a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1549,6 +1549,22 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return damon_set_regions(t, &addr_range, 1); } +/** + * damon_update_region_access_rate() - Update the access rate of a region. + * @r: The DAMON region to update for its access check result. + * @accessed: Whether the region has accessed during last sampling interval. + * + * Update the access rate of a region with the region's last sampling interval + * access check result. + * + * Usually this will be called by &damon_operations->check_accesses callback.
+ */ +void damon_update_region_access_rate(struct damon_region *r, bool accessed) +{ + if (accessed) + r->nr_accesses++; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 909db25efb35..44f21860b555 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -157,14 +157,12 @@ static void __damon_pa_check_access(struct damon_region *r) /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); return; } last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); last_addr = r->sampling_addr; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cf8a9fc5c9d1..53371bbec605 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -567,14 +567,12 @@ static void __damon_va_check_access(struct mm_struct *mm, /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); last_addr = r->sampling_addr; } -- cgit From 22a7788038a660df8876ba54b941b418d2178f99 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:45 +0000 Subject: mm/damon/vaddr: call damon_update_region_access_rate() always When getting mm_struct of the monitoring target process fails, there wil be no need to increase the access rate counter (nr_accesses) of the regions for the process. Hence, damon_va_check_accesses() skips calling damon_update_region_access_rate() in the case. This breaks the assumption that damon_update_region_access_rate() is called for every region, for every sampling interval. Call the function for every region even in the case. This might increase the overhead in some cases, but such case would not be frequent, so no significant impact is really expected. 
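Simplified from the hunk below, the !mm case now records an explicit "not accessed" sample instead of skipping the region entirely:

	if (!mm) {
		/* one update per region per sampling interval, always */
		damon_update_region_access_rate(r, false);
		return;
	}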
Link: https://lkml.kernel.org/r/20230915025251.72816-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 53371bbec605..02ab448d9b1e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -564,6 +564,11 @@ static void __damon_va_check_access(struct mm_struct *mm, static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; + if (!mm) { + damon_update_region_access_rate(r, false); + return; + } + /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { @@ -587,15 +592,14 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { mm = damon_get_mm(t); - if (!mm) - continue; same_target = false; damon_for_each_region(r, t) { __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); same_target = true; } - mmput(mm); + if (mm) + mmput(mm); } return max_nr_accesses; -- cgit From d2c062ade07ffd206dd16bf085f02abc59651309 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:46 +0000 Subject: mm/damon/core: implement a pseudo-moving sum function For values that continuously change, moving averages or sums are good ways to provide fast updates while handling temporal and erroneous variability of the value. For example, the access rate counter (nr_accesses) is calculated as a sum of the number of positive sampled access check results collected during a discrete time window (aggregation interval), and hence it handles temporal and erroneous access check results, but provides the update only for every aggregation interval. Using a moving sum method for that could allow providing the value for every sampling interval. That could be useful for getting snapshots of monitoring results, or for running DAMOS in fine-grained timing. However, supporting the moving sum for cases where the number of samples in the time window is arbitrary could impose high overhead, since the number of past values that it needs to keep could be too high. The nr_accesses would also be one of those cases. To mitigate the overhead, implement a pseudo-moving sum function that only provides an estimated pseudo-moving sum. It assumes there was no error in the last discrete time window and subtracts a constant portion of the last discrete time window's sum. Note that the function does not strictly implement the moving sum, but it keeps an important property of the moving sum, which makes the value the same as the discrete-window based sum at each time window-aligned timing. Hence, people collecting the value at the old timings would see no difference. Link: https://lkml.kernel.org/r/20230915025251.72816-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 10532159323a..b005dc15009f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1549,6 +1549,46 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return damon_set_regions(t, &addr_range, 1); } +/* + * damon_moving_sum() - Calculate an inferred moving sum value. + * @mvsum: Inferred sum of the last @len_window values.
+ * @nomvsum: Non-moving sum of the last discrete @len_window window values. + * @len_window: The number of last values to take care of. + * @new_value: New value that will be added to the pseudo moving sum. + * + * Moving sum (moving average * window size) is good for handling noise, but + * the cost of keeping past values can be high for arbitrary window size. This + * function implements a lightweight pseudo moving sum function that doesn't + * keep the past window values. + * + * It simply assumes there was no noise in the past, and get the no-noise + * assumed past value to drop from @nomvsum and @len_window. @nomvsum is a + * non-moving sum of the last window. For example, if @len_window is 10 and we + * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25 + * values. Hence, this function simply drops @nomvsum / @len_window from + * given @mvsum and add @new_value. + * + * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for + * the last window could be vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20. For + * calculating next moving sum with a new value, we should drop 0 from 50 and + * add the new value. However, this function assumes it got value 5 for each + * of the last ten times. Based on the assumption, when the next value is + * measured, it drops the assumed past value, 5 from the current sum, and add + * the new value to get the updated pseduo-moving average. + * + * This means the value could have errors, but the errors will be disappeared + * for every @len_window aligned calls. For example, if @len_window is 10, the + * pseudo moving sum with 11th value to 19th value would have an error. But + * the sum with 20th value will not have the error. + * + * Return: Pseudo-moving average after getting the @new_value. + */ +unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, + unsigned int len_window, unsigned int new_value) +{ + return mvsum - nomvsum / len_window + new_value; +} + /** * damon_update_region_access_rate() - Update the access rate of a region. * @r: The DAMON region to update for its access check result. -- cgit From 0926e8ff96b58845e802438374caaca53ab36053 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:47 +0000 Subject: mm/damon/core-test: add a unit test for damon_moving_sum() Add a simple unit test for the pseudo moving-sum function (damon_moving_sum()). 
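As a quick sanity check of the arithmetic the test exercises: with mvsum = 50000, nomvsum = 50000 and len_window = 10, feeding in a new value of 10000 gives 50000 - 50000 / 10 + 10000 = 55000, the first expected value in the test below.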
Link: https://lkml.kernel.org/r/20230915025251.72816-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 6cc8b245586d..c539f0e8377e 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -341,6 +341,21 @@ static void damon_test_set_attrs(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); } +static void damon_test_moving_sum(struct kunit *test) +{ + unsigned int mvsum = 50000, nomvsum = 50000, len_window = 10; + unsigned int new_values[] = {10000, 0, 10000, 0, 0, 0, 10000, 0, 0, 0}; + unsigned int expects[] = {55000, 50000, 55000, 50000, 45000, 40000, + 45000, 40000, 35000, 30000}; + int i; + + for (i = 0; i < ARRAY_SIZE(new_values); i++) { + mvsum = damon_moving_sum(mvsum, nomvsum, len_window, + new_values[i]); + KUNIT_EXPECT_EQ(test, mvsum, expects[i]); + } +} + static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; @@ -425,6 +440,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_regions), KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), + KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), {}, -- cgit From 80333828ea7728ebe85d079bb5c1467eb9fc6c8c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:48 +0000 Subject: mm/damon/core: introduce nr_accesses_bp Add yet another representation of the access rate of each region, namely nr_accesses_bp. It is just same to the nr_accesses but represents the value in basis point (1 in 10,000), and updated at once in every aggregation interval. That is, moving_accesses_bp is just nr_accesses * 10000. This may seems useless at the moment. However, it will be useful for representing less than one nr_accesses value that will be needed to make moving sum-based nr_accesses. 
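For example, a region with nr_accesses = 15 gets nr_accesses_bp = 150000, as in the updated unit tests below. Once the moving-sum based update lands, rates below a single access per aggregation interval, e.g. 5000 (0.5 accesses), also become representable.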
Link: https://lkml.kernel.org/r/20230915025251.72816-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 5 +++++ mm/damon/core.c | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index c539f0e8377e..79f1f12e0dd5 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -94,6 +94,7 @@ static void damon_test_aggregate(struct kunit *test) for (ir = 0; ir < 3; ir++) { r = damon_new_region(saddr[it][ir], eaddr[it][ir]); r->nr_accesses = accesses[it][ir]; + r->nr_accesses_bp = accesses[it][ir] * 10000; damon_add_region(r, t); } it++; @@ -147,9 +148,11 @@ static void damon_test_merge_two(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; + r->nr_accesses_bp = 100000; damon_add_region(r, t); r2 = damon_new_region(100, 300); r2->nr_accesses = 20; + r2->nr_accesses_bp = 200000; damon_add_region(r2, t); damon_merge_two_regions(t, r, r2); @@ -196,6 +199,7 @@ static void damon_test_merge_regions_of(struct kunit *test) for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; + r->nr_accesses_bp = nrs[i] * 10000; damon_add_region(r, t); } @@ -297,6 +301,7 @@ static void damon_test_update_monitoring_result(struct kunit *test) struct damon_region *r = damon_new_region(3, 7); r->nr_accesses = 15; + r->nr_accesses_bp = 150000; r->age = 20; new_attrs = (struct damon_attrs){ diff --git a/mm/damon/core.c b/mm/damon/core.c index b005dc15009f..ce85c00b0a4c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -128,6 +128,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->ar.start = start; region->ar.end = end; region->nr_accesses = 0; + region->nr_accesses_bp = 0; INIT_LIST_HEAD(®ion->list); region->age = 0; @@ -508,6 +509,7 @@ static void damon_update_monitoring_result(struct damon_region *r, { r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->nr_accesses * 10000; r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); } @@ -1115,6 +1117,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); @@ -1138,6 +1141,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, else r->age++; + r->nr_accesses_bp = r->nr_accesses * 10000; + if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) @@ -1186,6 +1191,7 @@ static void damon_split_region_at(struct damon_target *t, new->age = r->age; new->last_nr_accesses = r->last_nr_accesses; + new->nr_accesses_bp = r->nr_accesses_bp; damon_insert_region(new, r, damon_next_region(r), t); } -- cgit From ace30fb21af5f1be1605db72c16040b95b1557ef Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:49 +0000 Subject: mm/damon/core: use pseudo-moving sum for nr_accesses_bp Let nr_accesses_bp be calculated as a pseudo-moving sum that updated for every sampling interval, using damon_moving_sum(). 
This is assumed to be useful for cases that the aggregation interval is set quite huge, but the monivoting results need to be collected earlier than next aggregation interval is passed. Link: https://lkml.kernel.org/r/20230915025251.72816-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 +++++++++++++++- mm/damon/paddr.c | 9 +++++---- mm/damon/vaddr.c | 12 +++++++----- 3 files changed, 27 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index ce85c00b0a4c..29ee1fc18393 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1599,14 +1599,28 @@ unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, * damon_update_region_access_rate() - Update the access rate of a region. * @r: The DAMON region to update for its access check result. * @accessed: Whether the region has accessed during last sampling interval. + * @attrs: The damon_attrs of the DAMON context. * * Update the access rate of a region with the region's last sampling interval * access check result. * * Usually this will be called by &damon_operations->check_accesses callback. */ -void damon_update_region_access_rate(struct damon_region *r, bool accessed) +void damon_update_region_access_rate(struct damon_region *r, bool accessed, + struct damon_attrs *attrs) { + unsigned int len_window = 1; + + /* + * sample_interval can be zero, but cannot be larger than + * aggr_interval, owing to validation of damon_set_attrs(). + */ + if (attrs->sample_interval) + len_window = attrs->aggr_interval / attrs->sample_interval; + r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp, + r->last_nr_accesses * 10000, len_window, + accessed ? 10000 : 0); + if (accessed) r->nr_accesses++; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 44f21860b555..081e2a325778 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -148,7 +148,8 @@ out: return accessed; } -static void __damon_pa_check_access(struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r, + struct damon_attrs *attrs) { static unsigned long last_addr; static unsigned long last_folio_sz = PAGE_SIZE; @@ -157,12 +158,12 @@ static void __damon_pa_check_access(struct damon_region *r) /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); return; } last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); last_addr = r->sampling_addr; } @@ -175,7 +176,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(r); + __damon_pa_check_access(r, &ctx->attrs); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 02ab448d9b1e..a4d1f63c5b23 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -558,26 +558,27 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, - struct damon_region *r, bool same_target) + struct damon_region *r, bool same_target, + struct damon_attrs *attrs) { static unsigned long last_addr; static unsigned long last_folio_sz 
= PAGE_SIZE; static bool last_accessed; if (!mm) { - damon_update_region_access_rate(r, false); + damon_update_region_access_rate(r, false, attrs); return; } /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); last_addr = r->sampling_addr; } @@ -594,7 +595,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) mm = damon_get_mm(t); same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(mm, r, same_target); + __damon_va_check_access(mm, r, same_target, + &ctx->attrs); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); same_target = true; } -- cgit From 401807a316bb913f5eefcaf8343575ec9296d6b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:50 +0000 Subject: mm/damon/core: skip updating nr_accesses_bp for each aggregation interval damon_merge_regions_of(), which is called for each aggregation interval, updates nr_accesses_bp to nr_accesses * 10000. However, nr_accesses_bp is updated for each sampling interval via damon_moving_sum() using the aggregation interval as the moving time window. And by the definition of the algorithm, the value becomes same to discrete-window based sum for each time window-aligned time. Hence, nr_accesses_bp will be same to nr_accesses * 10000 for each aggregation interval without explicit update. Remove the unnecessary update of nr_accesses_bp in damon_merge_regions_of(). Link: https://lkml.kernel.org/r/20230915025251.72816-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 29ee1fc18393..45cc108c0fe1 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1141,8 +1141,6 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, else r->age++; - r->nr_accesses_bp = r->nr_accesses * 10000; - if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) -- cgit From 863803a7948c8e33e6a7b002017747ca83ecfd63 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:51 +0000 Subject: mm/damon/core: mark damon_moving_sum() as a static function The function is used by only mm/damon/core.c. Mark it as a static function. Link: https://lkml.kernel.org/r/20230915025251.72816-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 45cc108c0fe1..b15cf47d2d29 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1587,7 +1587,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, * * Return: Pseudo-moving average after getting the @new_value. 
*/ -unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, +static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, unsigned int len_window, unsigned int new_value) { return mvsum - nomvsum / len_window + new_value; -- cgit From a9e34ea1f62c1e359ed197ec01d056c25edb2b61 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:53:58 +0100 Subject: mm: hugetlb_vmemmap: use nid of the head page to reallocate it

Patch series "mm: hugetlb: Skip initialization of gigantic tail struct pages if freed by HVO", v5.

This series moves the boot time initialization of tail struct pages of a gigantic page to later on in the boot. Only the HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized at the start. If HVO is successful, then no more tail struct pages need to be initialized. For a 1G hugepage, this series avoids initialization of 262144 - 63 = 262081 struct pages per hugepage.

When tested on a 512G system (allocating 500 1G hugepages), the kexec-boot times with DEFERRED_STRUCT_PAGE_INIT enabled are: - with patches, HVO enabled: 1.32 seconds - with patches, HVO disabled: 2.15 seconds - without patches, HVO enabled: 3.90 seconds - without patches, HVO disabled: 3.58 seconds

This represents an approximately 70% reduction in boot time and will significantly reduce server downtime when using a large number of gigantic pages.

This patch (of 4):

If tail page prep and initialization is skipped, then the "start" page will not contain the correct nid. Use the nid from the first vmemmap page.

Link: https://lkml.kernel.org/r/20230913105401.519709-1-usama.arif@bytedance.com Link: https://lkml.kernel.org/r/20230913105401.519709-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 0bde38626d25..ad650c7b07ec 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -319,7 +319,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, }; - int nid = page_to_nid((struct page *)start); + int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* -- cgit From ee8d2071ef52d83a2ac4f8a474fafb2aea91766d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:53:59 +0100 Subject: memblock: pass memblock_type to memblock_setclr_flag

This allows setting flags on both memblock types and is in preparation for setting flags (e.g. to not initialize struct pages) on reserved memory regions.
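As a quick illustration of the resulting calling convention (a sketch only; memblock_setclr_flag() is static to mm/memblock.c, so only helpers in that file use it), callers now spell out which region array the flag applies to:

	/* set a flag on a "memory" type region, as before */
	memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG);

	/*
	 * the same helper can now act on the "reserved" type, e.g. for the
	 * MEMBLOCK_RSRV_NOINIT flag introduced later in this series
	 */
	memblock_setclr_flag(&memblock.reserved, base, size, 1, MEMBLOCK_RSRV_NOINIT);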
[usama.arif@bytedance.com: add missing argument definition] Link: https://lkml.kernel.org/r/20230918090657.220463-1-usama.arif@bytedance.com Link: https://lkml.kernel.org/r/20230913105401.519709-3-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Rapoport (IBM) Acked-by: Mike Kravetz Cc: Fam Zheng Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memblock.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 913b2520a9a0..b978cda96cf0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -892,6 +892,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) /** * memblock_setclr_flag - set or clear flag for a memory region + * @type: memblock type to set/clear flag for * @base: base address of the region * @size: size of the region * @set: set or clear the flag @@ -901,10 +902,9 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) * * Return: 0 on success, -errno on failure. */ -static int __init_memblock memblock_setclr_flag(phys_addr_t base, - phys_addr_t size, int set, int flag) +static int __init_memblock memblock_setclr_flag(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, int set, int flag) { - struct memblock_type *type = &memblock.memory; int i, ret, start_rgn, end_rgn; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); @@ -933,7 +933,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG); } /** @@ -945,7 +945,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG); } /** @@ -962,7 +962,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) system_has_some_mirror = true; - return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR); } /** @@ -982,7 +982,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP); } /** @@ -994,7 +994,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); } static bool should_skip_region(struct memblock_type *type, -- cgit From 77e6c43e137c130138c3fbadc847351a83c4befe Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:00 +0100 Subject: memblock: introduce MEMBLOCK_RSRV_NOINIT flag For reserved memory regions marked with this flag, reserve_bootmem_region is not called during memmap_init_reserved_pages. This can be used to avoid struct page initialization for regions which won't need them, for e.g. 
hugepages with Hugepage Vmemmap Optimization enabled. Link: https://lkml.kernel.org/r/20230913105401.519709-4-usama.arif@bytedance.com Signed-off-by: Usama Arif Acked-by: Muchun Song Reviewed-by: Mike Rapoport (IBM) Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memblock.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index b978cda96cf0..fd492e5bbdbc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -997,6 +997,24 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); } +/** + * memblock_reserved_mark_noinit - Mark a reserved memory region with flag + * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized + * for this region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * struct pages will not be initialized for reserved memory regions marked with + * %MEMBLOCK_RSRV_NOINIT. + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.reserved, base, size, 1, + MEMBLOCK_RSRV_NOINIT); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -2113,13 +2131,18 @@ static void __init memmap_init_reserved_pages(void) memblock_set_node(start, end, &memblock.reserved, nid); } - /* initialize struct pages for the reserved regions */ + /* + * initialize struct pages for reserved regions that don't have + * the MEMBLOCK_RSRV_NOINIT flag set + */ for_each_reserved_mem_region(region) { - nid = memblock_get_region_node(region); - start = region->base; - end = start + region->size; + if (!memblock_is_reserved_noinit(region)) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; - reserve_bootmem_region(start, end, nid); + reserve_bootmem_region(start, end, nid); + } } } -- cgit From fde1c4ecf91640e5a95ec36b71ec2e8ec379ce40 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:01 +0100 Subject: mm: hugetlb: skip initialization of gigantic tail struct pages if freed by HVO The new boot flow when it comes to initialization of gigantic pages is as follows: - At boot time, for a gigantic page during __alloc_bootmem_hugepage, the region after the first struct page is marked as noinit. - This results in only the first struct page to be initialized in reserve_bootmem_region. As the tail struct pages are not initialized at this point, there can be a significant saving in boot time if HVO succeeds later on. - Later on in the boot, the head page is prepped and the first HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized. - HVO is attempted. If it is not successful, then the rest of the tail struct pages are initialized. If it is successful, no more tail struct pages need to be initialized saving significant boot time. The WARN_ON for increased ref count in gather_bootmem_prealloc was changed to a VM_BUG_ON. This is OK as there should be no speculative references this early in boot process. The VM_BUG_ON's are there just in case such code is introduced. 
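A minimal sketch of the allocation side of this flow (a hypothetical caller, mirroring what __alloc_bootmem_huge_page() does in the patch below; error handling elided), showing how the tail struct pages are excluded from boot-time initialization:

	void *m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
			0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);

	/* keep only the head struct page initialized at boot */
	if (m)
		memblock_reserved_mark_noinit(virt_to_phys(m + PAGE_SIZE),
					      huge_page_size(h) - PAGE_SIZE);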
[akpm@linux-foundation.org: make it nicer for 80 cols] Link: https://lkml.kernel.org/r/20230913105401.519709-5-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- mm/hugetlb_vmemmap.c | 2 +- mm/hugetlb_vmemmap.h | 9 +++---- mm/internal.h | 3 +++ mm/mm_init.c | 2 +- 5 files changed, 69 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a945efe2858a..4e276466d6aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3169,6 +3169,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } found: + + /* + * Only initialize the head struct page in memmap_init_reserved_pages, + * rest of the struct pages will be initialized by the HugeTLB + * subsystem itself. + * The head struct page is used to get folio information by the HugeTLB + * subsystem like zone id and node id. + */ + memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), + huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -3176,6 +3186,43 @@ found: return 1; } +/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ +static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + unsigned long start_page_number, + unsigned long end_page_number) +{ + enum zone_type zone = zone_idx(folio_zone(folio)); + int nid = folio_nid(folio); + unsigned long head_pfn = folio_pfn(folio); + unsigned long pfn, end_pfn = head_pfn + end_page_number; + int ret; + + for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone, nid); + prep_compound_tail((struct page *)folio, pfn - head_pfn); + ret = page_ref_freeze(page, 1); + VM_BUG_ON(!ret); + } +} + +static void __init hugetlb_folio_init_vmemmap(struct folio *folio, + struct hstate *h, + unsigned long nr_pages) +{ + int ret; + + /* Prepare folio head */ + __folio_clear_reserved(folio); + __folio_set_head(folio); + ret = page_ref_freeze(&folio->page, 1); + VM_BUG_ON(!ret); + /* Initialize the necessary tail struct pages */ + hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + prep_compound_head((struct page *)folio, huge_page_order(h)); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. 
@@ -3186,19 +3233,21 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); - struct folio *folio = page_folio(page); + struct folio *folio = (void *)page; struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); - if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { - WARN_ON(folio_test_reserved(folio)); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_folio(folio); /* add to the hugepage allocator */ - } else { - /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_folio(folio, huge_page_order(h)); - } + + hugetlb_folio_init_vmemmap(folio, h, + HUGETLB_VMEMMAP_RESERVE_PAGES); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + /* If HVO fails, initialize all tail struct pages */ + if (!HPageVmemmapOptimized(&folio->page)) + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + free_huge_folio(folio); /* add to the hugepage allocator */ /* * We need to restore the 'stolen' pages to totalram_pages @@ -3209,6 +3258,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ad650c7b07ec..76682d1d79a7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -588,7 +588,7 @@ static int __init hugetlb_vmemmap_init(void) const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ - BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 25bd0e002431..4573899855d7 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,15 +10,16 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); - /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/vm/vmemmap_dedup.rst. 
*/ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE +#define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { diff --git a/mm/internal.h b/mm/internal.h index a273f4d948d8..f7c963dfbdb3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1155,6 +1155,9 @@ struct vma_prepare { struct vm_area_struct *remove2; }; +void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid); + /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); diff --git a/mm/mm_init.c b/mm/mm_init.c index 6be6f50813b1..077bfe393b5e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -555,7 +555,7 @@ out: node_states[N_MEMORY] = saved_node_state; } -static void __meminit __init_single_page(struct page *page, unsigned long pfn, +void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { mm_zero_struct_page(page); -- cgit From 3ec145f9d01e799bc7e41277e571141546b850f3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:23 +0100 Subject: hugetlb: use a folio in free_hpage_workfn() Patch series "Small hugetlb cleanups", v2. Some trivial folio conversions This patch (of 3): update_and_free_hugetlb_folio puts the memory on hpage_freelist as a folio so we can take it off the list as a folio. Link: https://lkml.kernel.org/r/20230824141325.2704553-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230824141325.2704553-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e276466d6aa..7a6fe4f13777 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,22 +1787,22 @@ static void free_hpage_workfn(struct work_struct *work) node = llist_del_all(&hpage_freelist); while (node) { - struct page *page; + struct folio *folio; struct hstate *h; - page = container_of((struct address_space **)node, - struct page, mapping); + folio = container_of((struct address_space **)node, + struct folio, mapping); node = node->next; - page->mapping = NULL; + folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ - h = size_to_hstate(page_size(page)); + h = size_to_hstate(folio_size(folio)); - __update_and_free_hugetlb_folio(h, page_folio(page)); + __update_and_free_hugetlb_folio(h, folio); cond_resched(); } -- cgit From 04bbfd844b99c7c89a344236d3758e7a2d41572f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:24 +0100 Subject: hugetlb: remove a few calls to page_folio() Anything found on a linked list threaded through ->lru is guaranteed to be a folio as the compound_head found in a tail page overlaps the ->lru member of struct page. So we can pull folios directly off these lists no matter whether pages or folios were added to the list. 
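A hedged illustration of the point (not from the patch; freelist and to_free stand in for any lists threaded through ->lru): in a tail page the words that would hold ->lru instead hold compound_head, so a page actually linked through ->lru is necessarily a head page and can be typed as a folio directly:

	struct folio *folio, *next;

	/* every entry on an ->lru-threaded list is a head page */
	list_for_each_entry_safe(folio, next, freelist, lru)
		list_move(&folio->lru, &to_free);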
Link: https://lkml.kernel.org/r/20230824141325.2704553-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a6fe4f13777..e91ce966a2c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1836,11 +1836,9 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { - struct page *page, *t_page; - struct folio *folio; + struct folio *folio, *t_folio; - list_for_each_entry_safe(page, t_page, list, lru) { - folio = page_folio(page); + list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } @@ -2229,8 +2227,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, bool acct_surplus) { int nr_nodes, node; - struct page *page = NULL; - struct folio *folio; + struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2240,15 +2237,14 @@ static struct page *remove_pool_huge_page(struct hstate *h, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - page = list_entry(h->hugepage_freelists[node].next, - struct page, lru); - folio = page_folio(page); + folio = list_entry(h->hugepage_freelists[node].next, + struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } - return page; + return &folio->page; } /* @@ -3414,15 +3410,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count, * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { - struct page *page, *next; + struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; - list_for_each_entry_safe(page, next, freel, lru) { + list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; - if (PageHighMem(page)) + if (folio_test_highmem(folio)) continue; - remove_hugetlb_folio(h, page_folio(page), false); - list_add(&page->lru, &page_list); + remove_hugetlb_folio(h, folio, false); + list_add(&folio->lru, &page_list); } } -- cgit From d5b43e9683ec5c78524f9ae93c1824edcbc1f08d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:25 +0100 Subject: hugetlb: convert remove_pool_huge_page() to remove_pool_hugetlb_folio()

Convert the callers to expect a folio and remove the unnecessary conversion back to a struct page.

Link: https://lkml.kernel.org/r/20230824141325.2704553-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91ce966a2c9..de220e3ff8be 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1446,7 +1446,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for remove_pool_huge_page() - return the previously saved + * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page.
Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -2222,9 +2222,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static struct page *remove_pool_huge_page(struct hstate *h, - nodemask_t *nodes_allowed, - bool acct_surplus) +static struct folio *remove_pool_hugetlb_folio(struct hstate *h, + nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; @@ -2244,7 +2243,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, } } - return &folio->page; + return folio; } /* @@ -2598,7 +2597,6 @@ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - struct page *page; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); @@ -2619,15 +2617,17 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * remove_pool_huge_page() will balance the freed pages across the + * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); + if (!folio) goto out; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } out: @@ -3472,7 +3472,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; - struct page *page; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3594,11 +3593,13 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - page = remove_pool_huge_page(h, nodes_allowed, 0); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); + if (!folio) break; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); -- cgit From affa87c708185cab194099ee51b946ef0297f063 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:37 +0000 Subject: mm/damon/core: make DAMOS uses nr_accesses_bp instead of nr_accesses

Patch series "mm/damon: implement DAMOS apply intervals".

DAMON-based operation schemes are applied for every aggregation interval. That is mainly because schemes are using nr_accesses, which only becomes complete for use once per aggregation interval. This makes some DAMOS use cases tricky. Quota setting under a long aggregation interval is one such example. Suppose the aggregation interval is ten seconds, and there is a scheme having CPU quota 100ms per 1s. The scheme will actually use 100ms per ten seconds, since it cannot be applied before the next aggregation interval has passed. The feature is working as intended, but the results might not be that intuitive for some users. This could be fixed by updating the quota to 1s per 10s.
But in that case, the CPU usage of DAMOS could come in spikes and actually have a bad effect on other CPU-sensitive workloads. Also, with such a huge aggregation interval, users may want schemes to be applied more frequently.

DAMON provides nr_accesses_bp, which is updated for each sampling interval in a way that is reasonable to use. By using that instead of nr_accesses, DAMOS can have its own time interval and mitigate the above-mentioned issues.

This patchset makes DAMOS schemes use nr_accesses_bp instead of nr_accesses, and have their own timing intervals. Also update the DAMOS tried regions sysfs files and the DAMOS before_apply tracepoint to use the new data as their source. Note that the interval is zero by default, and it is interpreted as using the aggregation interval instead. This avoids making user-visible behavioral changes.

Patches Sequence ----------------

The first patch (patch 1/9) makes DAMOS use nr_accesses_bp instead of nr_accesses, and the following two patches (patches 2/9 and 3/9) update the DAMON sysfs interface for DAMOS tried regions and the DAMOS before_apply tracepoint to use nr_accesses_bp instead of nr_accesses, respectively. The following two patches (patches 4/9 and 5/9) implement the scheme-specific apply interval for DAMON kernel API users and update the design document for the new feature. Finally, the following four patches (patches 6/9, 7/9, 8/9 and 9/9) add support for the feature in the DAMON sysfs interface, add a simple selftest test case, and document the new file in the usage and ABI documents, respectively.

This patch (of 9):

DAMON provides nr_accesses_bp, which becomes the same as nr_accesses * 10000 for every aggregation interval, but is updated every sampling interval with reasonable accuracy. Since DAMON-based operation schemes are applied in every aggregation interval using nr_accesses, using nr_accesses_bp instead will make no difference to users. Meanwhile, it allows DAMOS to apply the schemes in a time interval that is less than the aggregation interval. It could be useful and more flexible for some cases. Do it.

Link: https://lkml.kernel.org/r/20230916020945.47296-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230916020945.47296-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index b15cf47d2d29..79fef5145a4b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -770,12 +770,13 @@ static void damon_split_region_at(struct damon_target *t, static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; + unsigned int nr_accesses = r->nr_accesses_bp / 10000; sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && - s->pattern.min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_nr_accesses <= nr_accesses && + nr_accesses <= s->pattern.max_nr_accesses && s->pattern.min_age_region <= r->age && r->age <= s->pattern.max_age_region; } -- cgit From e7639bb48d39dd7f544b19c8a09bb62f934b6e32 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:38 +0000 Subject: mm/damon/sysfs-schemes: use nr_accesses_bp as the source of tried_regions//nr_accesses

The DAMON sysfs interface exposes the access rate of each region via the DAMOS tried regions directory. For this, the nr_accesses field of the region is used.
DAMOS was actually using nr_accesses in the past, but it uses nr_accesses_bp now. Use the value that it is really using as the source. Note that this doesn't expose nr_accesses_bp as is (in basis points), but after converting it to a natural number by dividing the value by 10,000. Hence there is no behavioral change from the users' perspective.

Link: https://lkml.kernel.org/r/20230916020945.47296-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 527e7d17eb3b..093700f50b18 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -31,7 +31,7 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( return NULL; sysfs_region->kobj = (struct kobject){}; sysfs_region->ar = region->ar; - sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->nr_accesses = region->nr_accesses_bp / 10000; sysfs_region->age = region->age; INIT_LIST_HEAD(&sysfs_region->list); return sysfs_region; -- cgit From 42f994b71404b17abcd6b170de7a6aa95ffe5d4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:40 +0000 Subject: mm/damon/core: implement scheme-specific apply interval

DAMON-based operation schemes are applied for every aggregation interval. That was mainly because schemes were using nr_accesses, which only becomes complete for use once per aggregation interval. However, the schemes are now using nr_accesses_bp, which is updated for each sampling interval in a way that is reasonable to use. Therefore, there is no reason to apply schemes for each aggregation interval.

The unnecessary alignment with the aggregation interval was also making some use cases of DAMOS tricky. Quota setting under a long aggregation interval is one such example. Suppose the aggregation interval is ten seconds, and there is a scheme having CPU quota 100ms per 1s. The scheme will actually use 100ms per ten seconds, since it cannot be applied before the next aggregation interval has passed. The feature is working as intended, but the results might not be that intuitive for some users. This could be fixed by updating the quota to 1s per 10s. But in that case, the CPU usage of DAMOS could come in spikes, and would actually have a bad effect on other CPU-sensitive workloads.

Implement a dedicated timing interval for each DAMON-based operation scheme, namely apply_interval. The interval will be sampling interval aligned, and each scheme will be applied for its apply_interval. The interval is set to 0 by default, and it means the scheme should use the aggregation interval instead. This avoids old users seeing any behavioral difference.
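The per-scheme timing reduces to simple bookkeeping in units of passed sampling intervals (a sketch of what the patch below implements; for example, with a 100ms sampling interval and an apply_interval_us of 2s, the scheme fires once per 20 sampling intervals):

	unsigned long sample_interval = ctx->attrs.sample_interval ?
			ctx->attrs.sample_interval : 1;
	unsigned long apply_interval = s->apply_interval_us ?
			s->apply_interval_us : ctx->attrs.aggr_interval;

	/* the next firing point, measured in passed sampling intervals */
	s->next_apply_sis = ctx->passed_sample_intervals +
			apply_interval / sample_interval;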
Link: https://lkml.kernel.org/r/20230916020945.47296-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/core.c | 72 +++++++++++++++++++++++++++++++++++++++++++----- mm/damon/dbgfs.c | 3 +- mm/damon/lru_sort.c | 2 ++ mm/damon/reclaim.c | 2 ++ mm/damon/sysfs-schemes.c | 2 +- 5 files changed, 72 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 79fef5145a4b..5eb649bd002f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -313,7 +313,9 @@ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) } struct damos *damon_new_scheme(struct damos_access_pattern *pattern, - enum damos_action action, struct damos_quota *quota, + enum damos_action action, + unsigned long apply_interval_us, + struct damos_quota *quota, struct damos_watermarks *wmarks) { struct damos *scheme; @@ -323,6 +325,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return NULL; scheme->pattern = *pattern; scheme->action = action; + scheme->apply_interval_us = apply_interval_us; + /* + * next_apply_sis will be set when kdamond starts. While kdamond is + * running, it will also updated when it is added to the DAMON context, + * or damon_attrs are updated. + */ + scheme->next_apply_sis = 0; INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -335,9 +344,21 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return scheme; } +static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx) +{ + unsigned long sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + unsigned long apply_interval = s->apply_interval_us ? + s->apply_interval_us : ctx->attrs.aggr_interval; + + s->next_apply_sis = ctx->passed_sample_intervals + + apply_interval / sample_interval; +} + void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) { list_add_tail(&s->list, &ctx->schemes); + damos_set_next_apply_sis(s, ctx); } static void damon_del_scheme(struct damos *s) @@ -558,6 +579,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { unsigned long sample_interval = attrs->sample_interval ? attrs->sample_interval : 1; + struct damos *s; if (attrs->min_nr_regions < 3) return -EINVAL; @@ -573,6 +595,10 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; + + damon_for_each_scheme(s, ctx) + damos_set_next_apply_sis(s, ctx); + return 0; } @@ -1094,14 +1120,29 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damon_target *t; struct damon_region *r, *next_r; struct damos *s; + unsigned long sample_interval = c->attrs.sample_interval ? + c->attrs.sample_interval : 1; + bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + + s->next_apply_sis += + (s->apply_interval_us ? s->apply_interval_us : + c->attrs.aggr_interval) / sample_interval; + if (!s->wmarks.activated) continue; + has_schemes_to_apply = true; + damos_adjust_quota(c, s); } + if (!has_schemes_to_apply) + return; + damon_for_each_target(t, c) { damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); @@ -1372,11 +1413,19 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx) { unsigned long sample_interval = ctx->attrs.sample_interval ? 
ctx->attrs.sample_interval : 1; + unsigned long apply_interval; + struct damos *scheme; ctx->passed_sample_intervals = 0; ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / sample_interval; + + damon_for_each_scheme(scheme, ctx) { + apply_interval = scheme->apply_interval_us ? + scheme->apply_interval_us : ctx->attrs.aggr_interval; + scheme->next_apply_sis = apply_interval / sample_interval; + } } /* @@ -1428,19 +1477,28 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - sample_interval = ctx->attrs.sample_interval ? - ctx->attrs.sample_interval : 1; if (ctx->passed_sample_intervals == next_aggregation_sis) { - ctx->next_aggregation_sis = next_aggregation_sis + - ctx->attrs.aggr_interval / sample_interval; kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - if (!list_empty(&ctx->schemes)) - kdamond_apply_schemes(ctx); + } + + /* + * do kdamond_apply_schemes() after kdamond_merge_regions() if + * possible, to reduce overhead + */ + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); + + sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + if (ctx->passed_sample_intervals == next_aggregation_sis) { + ctx->next_aggregation_sis = next_aggregation_sis + + ctx->attrs.aggr_interval / sample_interval; + kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 124f0f8c97b7..dc0ea1fc30ca 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -278,7 +278,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(&pattern, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, 0, "a, + &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7b8fce2f67a8..3ecdcc029443 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -158,6 +158,8 @@ static struct damos *damon_lru_sort_new_scheme( pattern, /* (de)prioritize on LRU-lists */ action, + /* for each aggregation interval */ + 0, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 648d2a85523a..ab974e477d2f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -142,6 +142,8 @@ static struct damos *damon_reclaim_new_scheme(void) &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, + /* for each aggregation interval */ + 0, /* under the quota. */ &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 093700f50b18..3d30e85596b0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1610,7 +1610,7 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, "a, + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, 0, "a, &wmarks); if (!scheme) return NULL; -- cgit From a2a9f68e358fa9627aa72e7182ad0b82846bda9e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:42 +0000 Subject: mm/damon/sysfs-schemes: support DAMOS apply interval Update DAMON sysfs interface to support DAMOS apply intervals by adding a new file, 'apply_interval_us' in each scheme directory. 
Users can set and get the interval for each scheme in microseconds by writing to and reading from the file. Link: https://lkml.kernel.org/r/20230916020945.47296-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3d30e85596b0..a7d70b95c4dd 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1121,6 +1121,7 @@ struct damon_sysfs_scheme { struct kobject kobj; enum damos_action action; struct damon_sysfs_access_pattern *access_pattern; + unsigned long apply_interval_us; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_scheme_filters *filters; @@ -1141,7 +1142,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { }; static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) + enum damos_action action, unsigned long apply_interval_us) { struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); @@ -1150,6 +1151,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( return NULL; scheme->kobj = (struct kobject){}; scheme->action = action; + scheme->apply_interval_us = apply_interval_us; return scheme; } @@ -1353,6 +1355,25 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, return -EINVAL; } +static ssize_t apply_interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%lu\n", scheme->apply_interval_us); +} + +static ssize_t apply_interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + int err = kstrtoul(buf, 0, &scheme->apply_interval_us); + + return err ? 
err : count; +} + static void damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -1361,8 +1382,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_scheme_action_attr = __ATTR_RW_MODE(action, 0600); +static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr = + __ATTR_RW_MODE(apply_interval_us, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, + &damon_sysfs_scheme_apply_interval_us_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -1413,7 +1438,11 @@ static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, schemes->schemes_arr = schemes_arr; for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + /* + * apply_interval_us as 0 means same to aggregation interval + * (same to before-apply_interval behavior) + */ + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT, 0); if (!scheme) { damon_sysfs_schemes_rm_dirs(schemes); return -ENOMEM; @@ -1610,8 +1639,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, 0, "a, - &wmarks); + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, + sysfs_scheme->apply_interval_us, "a, &wmarks); if (!scheme) return NULL; @@ -1641,6 +1670,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->pattern.max_age_region = access_pattern->age->max; scheme->action = sysfs_scheme->action; + scheme->apply_interval_us = sysfs_scheme->apply_interval_us; scheme->quota.ms = sysfs_quotas->ms; scheme->quota.sz = sysfs_quotas->sz; -- cgit From ab428b4c459e62df7dab3b1b783ea03ea06ca895 Mon Sep 17 00:00:00 2001 From: Jianguo Bao Date: Sun, 17 Sep 2023 23:04:01 +0800 Subject: mm/writeback: update filemap_dirty_folio() comment Change to use new address space operation dirty_folio(). Link: https://lkml.kernel.org/r/20230917-trycontrib1-v1-1-db22630b8839@gmail.com Fixes: 6f31a5a261db ("fs: Add aops->dirty_folio") Signed-off-by: Jianguo Bau Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b8d3d7040a50..001adbb4a180 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2679,7 +2679,7 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function - * from their set_page_dirty address space operation. It ignores the + * from their dirty_folio address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * -- cgit From f950fa6ec6d2c058c64d364df5a460d1a0f16e96 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 18 Sep 2023 20:09:50 +0800 Subject: mm/damon/core-test: fix memory leak in damon_new_region() Patch series "mm/damon/core-test: Fix memory leaks in core-test", v3. There are a few memory leaks in core-test which are detected by kmemleak. This patchset fixes the issues. This patch (of 2): When CONFIG_DAMON_KUNIT_TEST=y and making CONFIG_DEBUG_KMEMLEAK=y and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y, the below memory leak is detected. 
The damon_region objects which are allocated by kmem_cache_alloc() in damon_new_region(), in damon_test_regions() and damon_test_update_monitoring_result(), are not freed. So for damon_test_regions(), replace the damon_del_region() call with damon_destroy_region() so that it calls both damon_del_region() and damon_free_region(); the latter will free the damon_region. For damon_test_update_monitoring_result(), call damon_free_region() to free it. After applying this patch, the following memory leak is no longer detected.

unreferenced object 0xffff2b49c3edc000 (size 56): comm "kunit_try_catch", pid 338, jiffies 4294895280 (age 557.084s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 02 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 49 2b ff ff ............I+.. backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<00000000b528f67c>] kmem_cache_alloc+0x168/0x284 [<000000008603f022>] damon_new_region+0x28/0x54 [<00000000a3b8c64e>] damon_test_regions+0x38/0x270 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 unreferenced object 0xffff2b49c5b20000 (size 56): comm "kunit_try_catch", pid 354, jiffies 4294895304 (age 556.988s) hex dump (first 32 bytes): 03 00 00 00 00 00 00 00 07 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 96 00 00 00 49 2b ff ff ............I+.. backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<00000000b528f67c>] kmem_cache_alloc+0x168/0x284 [<000000008603f022>] damon_new_region+0x28/0x54 [<00000000ca019f80>] damon_test_update_monitoring_result+0x18/0x34 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20

Link: https://lkml.kernel.org/r/20230918120951.2230468-1-ruanjinjie@huawei.com Link: https://lkml.kernel.org/r/20230918120951.2230468-2-ruanjinjie@huawei.com Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Fixes: f4c978b6594b ("mm/damon/core-test: add a test for damon_update_monitoring_results()") Signed-off-by: Jinjie Ruan Reviewed-by: SeongJae Park Cc: Brendan Higgins Cc: Feng Tang Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 79f1f12e0dd5..3959be35b901 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -30,7 +30,7 @@ static void damon_test_regions(struct kunit *test) damon_add_region(r, t); KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); - damon_del_region(r, t); + damon_destroy_region(r, t); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_free_target(t); @@ -321,6 +321,8 @@ static void damon_test_update_monitoring_result(struct kunit *test) damon_update_monitoring_result(r, &old_attrs, &new_attrs); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 20); + + damon_free_region(r); } static void damon_test_set_attrs(struct kunit *test) -- cgit From a0ce79253a96ee6bbcce90775ed342ef3153c68e Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 18 Sep 2023 20:09:51 +0800 Subject: mm/damon/core-test: fix memory leak in damon_new_ctx()

With CONFIG_DAMON_KUNIT_TEST=y, CONFIG_DEBUG_KMEMLEAK=y and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y, the below memory leak is detected.
The damon_ctx objects which are allocated by kzalloc() in damon_new_ctx() in damon_test_ops_registration() and damon_test_set_attrs() are not freed. So use damon_destroy_ctx() to free them. After applying this patch, the following memory leak is no longer detected.

unreferenced object 0xffff2b49c6968800 (size 512): comm "kunit_try_catch", pid 350, jiffies 4294895294 (age 557.028s) hex dump (first 32 bytes): 88 13 00 00 00 00 00 00 a0 86 01 00 00 00 00 00 ................ 00 87 93 03 00 00 00 00 0a 00 00 00 00 00 00 00 ................ backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<0000000073acab3b>] __kmem_cache_alloc_node+0x174/0x290 [<00000000b5f89cef>] kmalloc_trace+0x40/0x164 [<00000000eb19e83f>] damon_new_ctx+0x28/0xb4 [<00000000daf6227b>] damon_test_ops_registration+0x34/0x328 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 unreferenced object 0xffff2b49c1a9cc00 (size 512): comm "kunit_try_catch", pid 356, jiffies 4294895306 (age 557.000s) hex dump (first 32 bytes): 88 13 00 00 00 00 00 00 a0 86 01 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 0a 00 00 00 00 00 00 00 ................ backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<0000000073acab3b>] __kmem_cache_alloc_node+0x174/0x290 [<00000000b5f89cef>] kmalloc_trace+0x40/0x164 [<00000000eb19e83f>] damon_new_ctx+0x28/0xb4 [<00000000058495c4>] damon_test_set_attrs+0x30/0x1a8 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20

Link: https://lkml.kernel.org/r/20230918120951.2230468-3-ruanjinjie@huawei.com Fixes: d1836a3b2a9a ("mm/damon/core-test: initialise context before test in damon_test_set_attrs()") Fixes: 4f540f5ab4f2 ("mm/damon/core-test: add a kunit test case for ops registration") Signed-off-by: Jinjie Ruan Reviewed-by: Feng Tang Reviewed-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3959be35b901..649adf91ebc5 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -269,6 +269,8 @@ static void damon_test_ops_registration(struct kunit *test) /* Check double-registration failure again */ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + damon_destroy_ctx(c); } static void damon_test_set_regions(struct kunit *test) @@ -346,6 +348,8 @@ static void damon_test_set_attrs(struct kunit *test) invalid_attrs = valid_attrs; invalid_attrs.aggr_interval = 4999; KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); + + damon_destroy_ctx(c); } static void damon_test_moving_sum(struct kunit *test) -- cgit From 28e566572aacdc551e24649e57cc9f04ba880cd2 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:16 +0800 Subject: mm: add functions folio_in_range() and folio_within_vma()

Patch series "support large folio for mlock", v3.

Yu mentioned at [1] that mlock() can't be applied to large folios. I learned the related code and here is my understanding:

- Regarding RLIMIT_MEMLOCK, there is no problem, because the RLIMIT_MEMLOCK statistics are not tied to the underlying pages. That means mlock or munlock of the underlying pages doesn't impact the RLIMIT_MEMLOCK statistics collection, which is always correct.
- For keeping the page in RAM, there is no problem either. At least, during try_to_unmap_one(), once it detects that the VMA has the VM_LOCKED bit set in vm_flags, the folio will be kept whether the folio is mlocked or not. So the function of mlock for large folios works. But it's not optimized, because page reclaim needs to scan these large folios and may split them.

This series classifies large folios for mlock into four types:

- The large folio is in the VM_LOCKED range and fully mapped to the range

- The large folio is in the VM_LOCKED range but not fully mapped to the range

- The large folio crosses a VM_LOCKED VMA boundary

- The large folio crosses a last level page table boundary

For the first type, we mlock the large folio so page reclaim will skip it.

For the second/third type, we don't mlock the large folio. As the pages not mapped to the VM_LOCKED range are mapped to a non-VM_LOCKED range, if the system is in a memory pressure situation, the large folio can be picked by page reclaim and split. Then the pages not mapped to the VM_LOCKED range can be reclaimed.

For the fourth type, we don't mlock the large folio because holding one page table lock can't prevent the part in another last level page table from being unmapped. Thanks to Ryan for pointing this out.

To check whether the folio is fully mapped to the range, the PTEs need to be checked to see whether each page of the folio is associated. This needs to take the page table lock and is a heavy operation. So far, the only places that need this check are madvise and page reclaim. These functions already have their own PTE iterators.

patch1 introduces an API to check whether a large folio is in a VMA range. patch2 makes page reclaim/mlock_vma_folio/munlock_vma_folio support large folio mlock/munlock. patch3 makes the mlock/munlock syscalls support large folios.

Yu also mentioned a race which can make a folio unevictable after munlock during the RFC v2 discussion [3]:

We decided that that race issue didn't block this series based on:

- That race issue was not introduced by this series

- We had a looks-ok fix for that race issue. It needs to wait for the mlock_count fixing patch as Yosry Ahmed suggested [4]

[1] https://lore.kernel.org/linux-mm/CAOUHufbtNPkdktjt_5qM45GegVO-rCFOMkSh0HQminQ12zsV8Q@mail.gmail.com/ [2] https://lore.kernel.org/linux-mm/20230809061105.3369958-1-fengwei.yin@intel.com/ [3] https://lore.kernel.org/linux-mm/CAOUHufZ6=9P_=CAOQyw0xw-3q707q-1FVV09dBNDC-hpcpj2Pg@mail.gmail.com/

This patch (of 3):

folio_in_range() will be used to check whether the folio is mapped to a specific VMA and whether the mapping address of the folio is in the range. Also add a helper function folio_within_vma() to check whether a folio is within the range of a VMA, based on folio_in_range().

Link: https://lkml.kernel.org/r/20230918073318.1181104-1-fengwei.yin@intel.com Link: https://lkml.kernel.org/r/20230918073318.1181104-2-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/internal.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index f7c963dfbdb3..14f358d068ac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -586,6 +586,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); + +/* + * NOTE: This function can't tell whether the folio is "fully mapped" in the + * range.
+ * "fully mapped" means all the pages of folio is associated with the page + * table of range while this function just check whether the folio range is + * within the range [start, end). Funcation caller nees to do page table + * check if it cares about the page table association. + * + * Typical usage (like mlock or madvise) is: + * Caller knows at least 1 page of folio is associated with page table of VMA + * and the range [start, end) is intersect with the VMA range. Caller wants + * to know whether the folio is fully associated with the range. It calls + * this function to check whether the folio is in the range first. Then checks + * the page table to know whether the folio is fully mapped to the range. + */ +static inline bool +folio_within_range(struct folio *folio, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + pgoff_t pgoff, addr; + unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); + if (start > end) + return false; + + if (start < vma->vm_start) + start = vma->vm_start; + + if (end > vma->vm_end) + end = vma->vm_end; + + pgoff = folio_pgoff(folio); + + /* if folio start address is not in vma range */ + if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) + return false; + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + + return !(addr < start || end - addr < folio_size(folio)); +} + +static inline bool +folio_within_vma(struct folio *folio, struct vm_area_struct *vma) +{ + return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); +} + /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, -- cgit From 1acbc3f936146d1b34987294803ac131bc298ce8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:17 +0800 Subject: mm: handle large folio when large folio in VM_LOCKED VMA range If large folio is in the range of VM_LOCKED VMA, it should be mlocked to avoid being picked by page reclaim. Which may split the large folio and then mlock each pages again. Mlock this kind of large folio to prevent them being picked by page reclaim. For the large folio which cross the boundary of VM_LOCKED VMA or not fully mapped to VM_LOCKED VMA, we'd better not to mlock it. So if the system is under memory pressure, this kind of large folio will be split and the pages ouf of VM_LOCKED VMA can be reclaimed. Ideally, for large folio, we should mlock it when the large folio is fully mapped to VMA and munlock it if any page are unmampped from VMA. But it's not easy to detect whether the large folio is fully mapped to VMA in some cases (like add/remove rmap). So we update mlock_vma_folio() and munlock_vma_folio() to mlock/munlock the folio according to vma->vm_flags. Let caller to decide whether they should call these two functions. For add rmap, only mlock normal 4K folio and postpone large folio handling to page reclaim phase. It is possible to reuse page table iterator to detect whether folio is fully mapped or not during page reclaim phase. For remove rmap, invoke munlock_vma_folio() to munlock folio unconditionly because rmap makes folio not fully mapped to VMA. 
Link: https://lkml.kernel.org/r/20230918073318.1181104-3-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/internal.h | 23 ++++++++++++--------- mm/rmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 14f358d068ac..9e62c8b952b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -644,14 +644,10 @@ folio_within_vma(struct folio *folio, struct vm_area_struct *vma) * mlock is usually called at the end of page_add_*_rmap(), munlock at * the end of page_remove_rmap(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). - * - * @compound is used to include pmd mappings of THPs, but filter out - * pte mappings of THPs, which cannot be consistently counted: a pte - * mapping of the THP head cannot be distinguished by the page alone. */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. @@ -661,17 +657,24 @@ static inline void mlock_vma_folio(struct folio *folio, * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ - if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && - (compound || !folio_test_large(folio))) + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { - if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !folio_test_large(folio))) + /* + * munlock if the function is called. Ideally, we should only + * do munlock if any page of folio is unmapped from VMA and + * cause folio not fully mapped to VMA. + * + * But it's not easy to confirm that's the situation. So we + * always munlock the folio and page reclaim will correct it + * if it's wrong. + */ + if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } diff --git a/mm/rmap.c b/mm/rmap.c index d24e2c36372e..c6bb2339e35b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -798,6 +798,7 @@ struct folio_referenced_arg { unsigned long vm_flags; struct mem_cgroup *memcg; }; + /* * arg: folio_referenced_arg will be passed */ @@ -807,17 +808,33 @@ static bool folio_referenced_one(struct folio *folio, struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; + unsigned long start = address, ptes = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if ((vma->vm_flags & VM_LOCKED) && - (!folio_test_large(folio) || !pvmw.pte)) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, !pvmw.pte); - page_vma_mapped_walk_done(&pvmw); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ + if (vma->vm_flags & VM_LOCKED) { + if (!folio_test_large(folio) || !pvmw.pte) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + /* + * For large folio fully mapped to VMA, will + * be handled after the pvmw loop. 
+ * + * For a large folio crossing VMA boundaries, it's + * expected to be picked by page reclaim, but we + * should skip references to pages which are in + * the range of a VM_LOCKED vma: page reclaim + * should only count references to pages outside + * the range of the VM_LOCKED vma. + */ + ptes++; + pra->mapcount--; + continue; } if (pvmw.pte) { @@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio, pra->mapcount--; } + if ((vma->vm_flags & VM_LOCKED) && + folio_test_large(folio) && + folio_within_vma(folio, vma)) { + unsigned long s_align, e_align; + + s_align = ALIGN_DOWN(start, PMD_SIZE); + e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); + + /* folio doesn't cross page table boundary and fully mapped */ + if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + } + if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) @@ -1254,7 +1288,14 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && PageAnonExclusive(page), folio); - mlock_vma_folio(folio, vma, compound); + /* + * For large folio, only mlock it if it's fully mapped to VMA. It's + * not easy to check whether the large folio is fully mapped to VMA + * here. Only mlock normal 4K folio and leave page reclaim to handle + * large folio. + */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1354,7 +1395,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_folio(folio, vma, compound); + /* See comments in page_add_anon_rmap() */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1465,7 +1508,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, * it's only reliable while mapped. */ - munlock_vma_folio(folio, vma, compound); + munlock_vma_folio(folio, vma); } /* @@ -1530,7 +1573,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, false); + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); ret = false; break; -- cgit From dc68badcede4ec3b4e5cdfcb8f678670220ac2ca Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:18 +0800 Subject: mm: mlock: update mlock_pte_range to handle large folio

The current kernel only locks base-size folios during the mlock syscall. Add large folio support with the following rules (sketched in code after this list):

- Only mlock a large folio when it's in a VM_LOCKED VMA range and fully mapped to the page table. A fully mapped folio is required because, if the folio is not fully mapped to a VM_LOCKED VMA and the system is under memory pressure, page reclaim is allowed to pick up this folio, split it and reclaim the pages which are not in the VM_LOCKED VMA.

- munlock will apply to a large folio which is in the VMA range or crosses the VMA boundary. This is required to handle the case that the large folio is mlocked and the VMA is later split in the middle of the large folio.
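In rough C, the rules above amount to the following. This is a simplified, hedged sketch, not the patch's code: nr_present_ptes stands in for the count that the patch's folio_mlock_step() computes, and the real helpers appear in the diff below.

/* Hedged sketch of the mlock/munlock decision per folio. */
static void mlock_rule_sketch(struct folio *folio, struct vm_area_struct *vma,
			      unsigned long start, unsigned long end,
			      unsigned int nr_present_ptes)
{
	if (!(vma->vm_flags & VM_LOCKED)) {
		/* munlock even partially mapped folios: the VMA may
		 * have been split after the folio was mlocked. */
		munlock_folio(folio);
		return;
	}

	/* mlock only folios that are in range and fully mapped ... */
	if (folio_within_range(folio, vma, start, end) &&
	    nr_present_ptes == folio_nr_pages(folio))
		mlock_folio(folio);
	/* ... otherwise leave the folio alone so that reclaim can
	 * split it and free the part outside the VM_LOCKED range. */
}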
Link: https://lkml.kernel.org/r/20230918073318.1181104-4-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/mlock.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 06bdfab83b58..42b6865f8f82 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -305,6 +305,58 @@ void munlock_folio(struct folio *folio) local_unlock(&mlock_fbatch.lock); } +static inline unsigned int folio_mlock_step(struct folio *folio, + pte_t *pte, unsigned long addr, unsigned long end) +{ + unsigned int count, i, nr = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + pte_t ptent = ptep_get(pte); + + if (!folio_test_large(folio)) + return 1; + + count = pfn + nr - pte_pfn(ptent); + count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT); + + for (i = 0; i < count; i++, pte++) { + pte_t entry = ptep_get(pte); + + if (!pte_present(entry)) + break; + if (pte_pfn(entry) - pfn >= nr) + break; + } + + return i; +} + +static inline bool allow_mlock_munlock(struct folio *folio, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned int step) +{ + /* + * For unlock, allow munlock of a large folio which is + * partially mapped to the VMA, as it's possible that the + * large folio was mlocked and the VMA was split later. + * + * During memory pressure, such kind of large folio can + * be split, and the pages not in a VM_LOCKED VMA + * can be reclaimed. + */ + if (!(vma->vm_flags & VM_LOCKED)) + return true; + + /* folio not in range [start, end), skip mlock */ + if (!folio_within_range(folio, vma, start, end)) + return false; + + /* folio is not fully mapped, skip mlock */ + if (step != folio_nr_pages(folio)) + return false; + + return true; +} + static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -314,6 +366,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; + unsigned int step = 1; + unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -334,6 +388,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) @@ -341,12 +396,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; - if (folio_test_large(folio)) - continue; + + step = folio_mlock_step(folio, pte, addr, end); + if (!allow_mlock_munlock(folio, vma, start, end, step)) + goto next_entry; + + if (vma->vm_flags & VM_LOCKED) + mlock_folio(folio); + else + munlock_folio(folio); + +next_entry: + pte += step - 1; + addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: -- cgit From 51a23b1be92046f0cc52384d30cf060700f1a54e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 2 Aug 2023 17:28:56 +0800 Subject: acpi,mm: fix typo sibiling -> sibling

First found this typo while reviewing the memory tier code. Fix it by sed like:

$ sed -i 's/sibiling/sibling/g' $(git grep -l sibiling)

so the acpi one will be corrected as well.
Link: https://lkml.kernel.org/r/20230802092856.819328-1-lizhijian@cn.fujitsu.com Signed-off-by: Li Zhijian Cc: Aneesh Kumar K.V Cc: Huang, Ying Cc: Len Brown Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 37a4f59d9585..876d8a5e210e 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -115,7 +115,7 @@ static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memti nodemask_t nodes = NODE_MASK_NONE; struct memory_dev_type *memtype; - list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling) + list_for_each_entry(memtype, &memtier->memory_types, tier_sibling) nodes_or(nodes, nodes, memtype->nodes); return nodes; @@ -174,7 +174,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty * If the memtype is already part of a memory tier, * just return that. */ - if (!list_empty(&memtype->tier_sibiling)) { + if (!list_empty(&memtype->tier_sibling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; @@ -218,7 +218,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty memtier = new_memtier; link_memtype: - list_add(&memtype->tier_sibiling, &memtier->memory_types); + list_add(&memtype->tier_sibling, &memtier->memory_types); return memtier; } @@ -527,7 +527,7 @@ static bool clear_node_memory_tier(int node) memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { - list_del_init(&memtype->tier_sibiling); + list_del_init(&memtype->tier_sibling); if (list_empty(&memtier->memory_types)) destroy_memory_tier(memtier); } @@ -553,7 +553,7 @@ struct memory_dev_type *alloc_memory_type(int adistance) return ERR_PTR(-ENOMEM); memtype->adistance = adistance; - INIT_LIST_HEAD(&memtype->tier_sibiling); + INIT_LIST_HEAD(&memtype->tier_sibling); memtype->nodes = NODE_MASK_NONE; kref_init(&memtype->kref); return memtype; -- cgit From 72a14e821cba73f74aca4bc5f768d77dece8bdb2 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 15 Sep 2023 18:58:44 +0800 Subject: memcg: expose swapcache stat for memcg v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Expose swapcache stat for memcg v1", v2. Since commit b6038942480e ("mm: memcg: add swapcache stat for memcg v2") adds swapcache stat for the cgroup v2, it seems there is no reason to hide it in memcg v1. Conversely, with swapcached it is more accurate to evaluate the available memory for memcg. 
Link: https://lkml.kernel.org/r/20230915105845.3199656-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20230915105845.3199656-2-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Yosry Ahmed Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Michal Hocko Cc: Michal Koutný Cc: Zefan Li Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68313331971c..7b50f214f9d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4068,7 +4068,10 @@ static const unsigned int memcg1_stats[] = { NR_WRITEBACK, WORKINGSET_REFAULT_ANON, WORKINGSET_REFAULT_FILE, +#ifdef CONFIG_SWAP MEMCG_SWAP, + NR_SWAPCACHE, +#endif }; static const char *const memcg1_stat_names[] = { @@ -4083,7 +4086,10 @@ static const char *const memcg1_stat_names[] = { "writeback", "workingset_refault_anon", "workingset_refault_file", +#ifdef CONFIG_SWAP "swap", + "swapcached", +#endif }; /* Universal VM events cgroup1 shows, original sort order */ -- cgit From 840ea53a8dec3aa5773f7957d4eaafdf925c664a Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 15 Sep 2023 18:58:45 +0800 Subject: memcg: remove unused do_memsw_account in memcg1_stat_format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Since commit b25806dcd3d5 ("mm: memcontrol: deprecate swapaccounting=0 mode"), do_memsw_account() is synonymous with !cgroup_subsys_on_dfl(memory_cgrp_subsys), so it always equals true in memcg1_stat_format(). Remove the unused code.

Link: https://lkml.kernel.org/r/20230915105845.3199656-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Michal Koutný Reviewed-by: Yosry Ahmed Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b50f214f9d6..8713bc796c77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4113,8 +4113,6 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) - continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr * memcg_page_state_unit(memcg1_stats[i])); @@ -4137,15 +4135,12 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) } seq_buf_printf(s, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_memsw_account()) - seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", - (u64)memsw * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) - continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * memcg_page_state_unit(memcg1_stats[i])); -- cgit From 76a0fb4fd5c940fe4977f2fbdb08167fa16e32f6 Mon Sep 17 00:00:00 2001 From: liwenyu Date: Wed, 20 Sep 2023 17:38:49 +0800 Subject: delayacct: add memory reclaim delay in get_page_from_freelist

The current memory reclaim delay statistics only count
the direct memory reclaim of the task in do_try_to_free_pages(). On systems with NUMA enabled, some tasks occasionally experience slower response times, yet the total reclaim count does not increase; ftrace shows that node_reclaim has occurred. The memory reclaim occurring in get_page_from_freelist() is also due to heavy memory load. To capture the impact of memory reclaim on tasks, this patch adds memory reclaim delay accounting to __node_reclaim().

Link: https://lkml.kernel.org/r/181C946095F0252B+7cc60eca-1abf-4502-aad3-ffd8ef89d910@ex.bilibili.com Signed-off-by: Wen Yu Li Cc: Balbir Singh Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3df0e2a59052..55424215f759 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7326,6 +7326,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in cond_resched(); psi_memstall_enter(&pflags); + delayacct_freepages_start(); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -7348,6 +7349,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); + delayacct_freepages_end(); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); -- cgit From d98388cef5315d0235fdcad26600102f54966927 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Thu, 21 Sep 2023 16:15:35 +0800 Subject: mm/filemap: increase usage of folio_next_index() helper

Simplify the code pattern 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index() in filemap_map_pages().

Link: https://lkml.kernel.org/r/20230921081535.3398-1-duminjie@vivo.com Signed-off-by: Minjie Du Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yin Fengwei Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f0a15ce1bd1b..9481ffaf24e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3591,7 +3591,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - end = folio->index + folio_nr_pages(folio) - 1; + end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) -- cgit From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:12 +0800 Subject: mm: memory: add vm_normal_folio_pmd()

Patch series "mm: convert numa balancing functions to use a folio", v2.

do_numa_page() only handles non-compound pages, and only PMD-mapped THPs are handled in do_huge_pmd_numa_page(). But large, PTE-mapped folios will be supported, so let's convert more numa balancing functions to use/take a folio in preparation for that; no functional change intended for now.

This patch (of 6):

The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(), which allows callers to completely replace the struct page variables with struct folio variables.
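To make the wrapper pattern concrete, here is a hedged before/after sketch of a call site; numa_fault_folio() is an illustrative name, and "vma", "haddr" and "pmd" are assumed to come from the surrounding fault handler (the real conversion is in the do_huge_pmd_numa_page() patch that follows):

/* Hedged illustration of the call-site simplification. */
static struct folio *numa_fault_folio(struct vm_area_struct *vma,
				      unsigned long haddr, pmd_t pmd)
{
	/* Old pattern: fetch the page, then convert it to a folio:
	 *
	 *	struct page *page = vm_normal_page_pmd(vma, haddr, pmd);
	 *	return page ? page_folio(page) : NULL;
	 */

	/* New pattern: one call, the folio (or NULL) comes back directly. */
	return vm_normal_folio_pmd(vma, haddr, pmd);
}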
Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d956b231e835..311e862c6404 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -689,6 +689,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, out: return pfn_to_page(pfn); } + +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page = vm_normal_page_pmd(vma, addr, pmd); + + if (page) + return page_folio(page); + return NULL; +} #endif static void restore_exclusive_pte(struct vm_area_struct *vma, -- cgit From 667ffc31aa95e7023707924b08415523208bce9d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:13 +0800 Subject: mm: huge_memory: use a folio in do_huge_pmd_numa_page() Use a folio in do_huge_pmd_numa_page(), reduce three page_folio() calls to one, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5c20e43782e4..5baf9b6dc522 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1517,9 +1517,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; pmd_t oldpmd = vmf->orig_pmd; pmd_t pmd; - struct page *page; + struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = NUMA_NO_NODE; + int nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool migrated = false, writable = false; int flags = 0; @@ -1541,36 +1541,34 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) can_change_pmd_writable(vma, vmf->address, pmd)) writable = true; - page = vm_normal_page_pmd(vma, haddr, pmd); - if (!page) + folio = vm_normal_folio_pmd(vma, haddr, pmd); + if (!folio) goto out_map; /* See similar comment in do_numa_page for explanation */ if (!writable) flags |= TNF_NO_GROUP; - page_nid = page_to_nid(page); + nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. 
*/ - if (node_is_toptier(page_nid)) - last_cpupid = page_cpupid_last(page); - target_nid = numa_migrate_prep(page, vma, haddr, page_nid, - &flags); - + if (node_is_toptier(nid)) + last_cpupid = page_cpupid_last(&folio->page); + target_nid = numa_migrate_prep(&folio->page, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { - put_page(page); + folio_put(folio); goto out_map; } spin_unlock(vmf->ptl); writable = false; - migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid); + migrated = migrate_misplaced_folio(folio, vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; - page_nid = target_nid; + nid = target_nid; } else { flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1582,9 +1580,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, - flags); + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0; -- cgit From 6695cf68b15c215d33b8add64c33e01e3cbe236c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:14 +0800 Subject: mm: memory: use a folio in do_numa_page()

NUMA balancing only tries to migrate non-compound pages in do_numa_page(), so use a folio there to save several compound_head() calls. Note that we use folio_estimated_sharers(): it is enough to check the folio sharers since only normal pages are handled here. If large folio numa balancing is supported later, a precise folio sharers check would be used instead. No functional change intended.

Link: https://lkml.kernel.org/r/20230921074417.24004-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 311e862c6404..865741d9b6b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4744,8 +4744,8 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL; - int page_nid = NUMA_NO_NODE; + struct folio *folio = NULL; + int nid = NUMA_NO_NODE; bool writable = false; int last_cpupid; int target_nid; @@ -4776,12 +4776,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) can_change_pte_writable(vma, vmf->address, pte)) writable = true; - page = vm_normal_page(vma, vmf->address, pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, vmf->address, pte); + if (!folio || folio_is_zone_device(folio)) goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) + if (folio_test_large(folio)) goto out_map; /* @@ -4796,34 +4796,34 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_NO_GROUP; /* - * Flag if the page is shared between multiple address spaces. This + * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - page_nid = page_to_nid(page); + nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value.
*/ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(page_nid)) + !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(page); - target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, - &flags); + last_cpupid = page_cpupid_last(&folio->page); + target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, + &flags); if (target_nid == NUMA_NO_NODE) { - put_page(page); + folio_put(folio); goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { - page_nid = target_nid; + if (migrate_misplaced_folio(folio, vma, target_nid)) { + nid = target_nid; flags |= TNF_MIGRATED; } else { flags |= TNF_MIGRATE_FAIL; @@ -4839,8 +4839,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, 1, flags); + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, 1, flags); return 0; out_map: /* -- cgit From cda6d93672ac5dd8af778a3f3e6082e12233b65b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:15 +0800 Subject: mm: memory: make numa_migrate_prep() to take a folio In preparation for large folio numa balancing, make numa_migrate_prep() to take a folio, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/internal.h | 2 +- mm/memory.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5baf9b6dc522..aa0224556132 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1556,7 +1556,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) */ if (node_is_toptier(nid)) last_cpupid = page_cpupid_last(&folio->page); - target_nid = numa_migrate_prep(&folio->page, vma, haddr, nid, &flags); + target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; diff --git a/mm/internal.h b/mm/internal.h index 9e62c8b952b8..5a5c923725d3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -983,7 +983,7 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); void free_zone_device_page(struct page *page); diff --git a/mm/memory.c b/mm/memory.c index 865741d9b6b9..20b290c9dc87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4724,10 +4724,10 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { - get_page(page); + folio_get(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(page, vma, addr); + return mpol_misplaced(&folio->page, vma, addr); } static vm_fault_t 
do_numa_page(struct vm_fault *vmf) @@ -4812,8 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) last_cpupid = (-1 & LAST_CPUPID_MASK); else last_cpupid = page_cpupid_last(&folio->page); - target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, - &flags); + target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; -- cgit From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:16 +0800 Subject: mm: mempolicy: make mpol_misplaced() to take a folio In preparation for large folio numa balancing, make mpol_misplaced() to take a folio, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- mm/mempolicy.c | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 20b290c9dc87..7ca16ed67634 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(&folio->page, vma, addr); + return mpol_misplaced(folio, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1b00d6ac7ee..69c0eac7292c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2564,24 +2564,25 @@ static void sp_free(struct sp_node *n) } /** - * mpol_misplaced - check whether current page node is valid in policy + * mpol_misplaced - check whether current folio node is valid in policy * - * @page: page to be checked - * @vma: vm area where page mapped - * @addr: virtual address where page mapped + * @folio: folio to be checked + * @vma: vm area where folio mapped + * @addr: virtual address in @vma for shared policy lookup and interleave policy * - * Lookup current policy node id for vma,addr and "compare to" page's + * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this - * policy, or a suitable node ID to allocate a replacement page from. + * policy, or a suitable node ID to allocate a replacement folio from. 
*/ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol; struct zoneref *z; - int curnid = page_to_nid(page); + int curnid = folio_nid(folio); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); @@ -2637,11 +2638,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long BUG(); } - /* Migrate the page towards the node whose CPU is referencing it */ + /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + if (!should_numa_migrate_memory(current, &folio->page, curnid, + thiscpu)) goto out; } -- cgit From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:17 +0800 Subject: sched/numa, mm: make numa migrate functions to take a folio

The cpupid (or access time) is stored in the head page for THP, so it is safe to make should_numa_migrate_memory() and numa_hint_fault_latency() take a folio. This is in preparation for large folio numa balancing.

Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69c0eac7292c..abd94f4c7f6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, &folio->page, curnid, + if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } -- cgit From 987ffa5a3858bee448dc791cf6f596790aea52a8 Mon Sep 17 00:00:00 2001 From: Huan Yang Date: Wed, 20 Sep 2023 09:57:27 +0800 Subject: mm/damon/core: remove unnecessary si_meminfo invoke.

si_meminfo() reads and assigns more info than just the free/ram pages. For the DAMOS_WMARK_FREE_MEM_RATE use case, fetching only the free and total ram pages is enough and saves CPU.

Link: https://lkml.kernel.org/r/20230920015727.4482-1-link@vivo.com Signed-off-by: Huan Yang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 5eb649bd002f..9f4f7c378cf3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1326,12 +1326,10 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) { - struct sysinfo i; - switch (metric) { case DAMOS_WMARK_FREE_MEM_RATE: - si_meminfo(&i); - return i.freeram * 1000 / i.totalram; + return global_zone_page_state(NR_FREE_PAGES) * 1000 / + totalram_pages(); default: break; } -- cgit From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 12:20:17 -0700 Subject: mm/filemap: remove hugetlb special casing in filemap.c

Remove the special-cased hugetlb handling code within the page cache by changing the granularity of ->index to the base page size rather than the huge page size.
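A hedged sketch of the index convention this change adopts; hugetlb_to_linear_index() is an illustrative name, and the equivalent shift appears in hugetlb_add_to_page_cache() in the diff below:

/*
 * The page cache now indexes hugetlb folios in PAGE_SIZE units, so a
 * huge-page-granular index must be scaled by the hstate order before
 * any xarray lookup or store.
 */
static pgoff_t hugetlb_to_linear_index(struct hstate *h, pgoff_t huge_idx)
{
	/* e.g. 2MB pages on x86-64: huge_page_order(h) == 9, idx * 512 */
	return huge_idx << huge_page_order(h);
}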
The motivation of this patch is to reduce complexity within the filemap code while also increasing performance by removing branches that are evaluated on every page cache lookup. To support the change in index, new wrappers for hugetlb page cache interactions are added. These wrappers perform the conversion to a linear index which is now expected by the page cache for huge pages. ========================= PERFORMANCE ====================================== Perf was used to check the performance differences after the patch. Overall the performance is similar to mainline with a very small larger overhead that occurs in __filemap_add_folio() and hugetlb_add_to_page_cache(). This is because of the larger overhead that occurs in xa_load() and xa_store() as the xarray is now using more entries to store hugetlb folios in the page cache. Timing aarch64 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt real 1m49.568s user 0m0.000s sys 1m49.461s 6.5-rc3: [root]# time fallocate -l 700GB test.txt real 1m47.495s user 0m0.000s sys 1m47.370s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m47.024s user 0m0.000s sys 1m46.921s 6.5-rc3: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m44.551s user 0m0.000s sys 1m44.438s x86 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt real 0m22.383s user 0m0.000s sys 0m22.255s 6.5-rc3: [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt real 0m22.735s user 0m0.038s sys 0m22.567s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt real 0m25.786s user 0m0.001s sys 0m25.589s 6.5-rc3: [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt real 0m33.454s user 0m0.001s sys 0m33.193s aarch64: workload - fallocate a 700GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--95.04%--__pi_clear_page | |--3.57%--clear_huge_page | | | |--2.63%--rcu_all_qs | | | --0.91%--__cond_resched | --0.67%--__cond_resched 0.17% 0.00% 0 fallocate [kernel.vmlinux] [k] hugetlb_add_to_page_cache 0.14% 0.10% 11 fallocate [kernel.vmlinux] [k] __filemap_add_folio 6.5-rc3 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--94.91%--__pi_clear_page | |--4.11%--clear_huge_page | | | |--3.00%--rcu_all_qs | | | --1.10%--__cond_resched | --0.59%--__cond_resched 0.08% 0.01% 1 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.05% 0.03% 3 fallocate [kernel.kallsyms] [k] __filemap_add_folio x86 workload - fallocate a 100GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: hugetlbfs_fallocate | --99.57%--clear_huge_page | --98.47%--clear_page_erms | --0.53%--asm_sysvec_apic_timer_interrupt 0.04% 0.04% 1 fallocate [kernel.kallsyms] [k] xa_load 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] __filemap_add_folio 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] xas_store 6.5-rc3 2MB Page Size: --99.93%--__x64_sys_fallocate vfs_fallocate hugetlbfs_fallocate | --99.38%--clear_huge_page | |--98.40%--clear_page_erms | --0.59%--__cond_resched 0.03% 0.03% 1 fallocate [kernel.kallsyms] [k] __filemap_add_folio ========================= TESTING ====================================== This patch passes 
libhugetlbfs tests and LTP hugetlb tests ********** TEST SUMMARY * 2M * 32-bit 64-bit * Total testcases: 110 113 * Skipped: 0 0 * PASS: 107 113 * FAIL: 0 0 * Killed by signal: 3 0 * Bad configuration: 0 0 * Expected FAIL: 0 0 * Unexpected PASS: 0 0 * Test not present: 0 0 * Strange test result: 0 0 ********** Done executing testcases. LTP Version: 20220527-178-g2761a81c4 page migration was also tested using Mike Kravetz's test program.[8] [dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Dan Carpenter Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/filemap.c | 34 ++++++++++------------------------ mm/hugetlb.c | 32 ++++++-------------------------- mm/migrate.c | 6 +++--- 3 files changed, 19 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 9481ffaf24e6..340c693112a9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping, mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by a single entry in the xarray */ - if (!folio_test_hugetlb(folio)) { - xas_set_order(&xas, folio->index, folio_order(folio)); - nr = folio_nr_pages(folio); - } + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio) if (free_folio) free_folio(folio); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } @@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping, if (!huge) { int error = mem_cgroup_charge(folio, NULL, gfp); - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) return error; charged = true; - xas_set_order(&xas, index, folio_order(folio)); - nr = folio_nr_pages(folio); } + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); + xas_set_order(&xas, index, folio_order(folio)); + nr = folio_nr_pages(folio); + gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; @@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2104,7 +2102,7 @@ put: int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); - - 
if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2232,10 +2224,7 @@ update_start: if (nr) { folio = fbatch->folios[nr - 1]; - if (folio_test_hugetlb(folio)) - *start = folio->index + 1; - else - *start = folio_next_index(folio); + *start = folio->index + folio_nr_pages(folio); } out: rcu_read_unlock(); @@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de220e3ff8be..0aaf28ce32e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t) /* * Convert the address within this vma to the page offset within - * the mapping, in pagecache page units; huge pages here. + * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. @@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t hugetlb_basepage_index(struct page *page) -{ - struct page *page_head = compound_head(page); - pgoff_t index = page_index(page_head); - unsigned long compound_idx; - - if (compound_order(page_head) > MAX_ORDER) - compound_idx = page_to_pfn(page) - page_to_pfn(page_head); - else - compound_idx = page - page_head; - - return (index << compound_order(page_head)) + compound_idx; -} - static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = vma_hugecache_offset(h, vma, address); + pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); @@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping struct hstate *h = hstate_inode(inode); int err; + idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); @@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. 
*/ new_folio = false; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -6205,7 +6185,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_folio(mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (is_continue) { ret = -EFAULT; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; diff --git a/mm/migrate.c b/mm/migrate.c index 7d1804c4a5d9..942f8c9cd068 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -524,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; xas_lock_irq(&xas); - expected_count = 2 + folio_has_private(src); + expected_count = folio_expected_refs(mapping, src); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } dst->index = src->index; dst->mapping = src->mapping; - folio_get(dst); + folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); - folio_ref_unfreeze(src, expected_count - 1); + folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); -- cgit From a48bf7b4757cd8de3497c2878536f46a8d2da65c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 10:44:33 -0700 Subject: mm/hugetlb: replace page_ref_freeze() with folio_ref_freeze() in hugetlb_folio_init_vmemmap()

No functional difference; folio_ref_freeze() is currently a wrapper for page_ref_freeze().

Link: https://lkml.kernel.org/r/20230926174433.81241-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0aaf28ce32e5..36b40bc9ac25 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3191,7 +3191,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); - ret = page_ref_freeze(&folio->page, 1); + ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); -- cgit From 07a8bdd4120ced3490ef9adf51b8086af0aaa8e7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:25 +0800 Subject: memory tiering: add abstract distance calculation algorithms management

Patch series "memory tiering: calculate abstract distance based on ACPI HMAT", v4.

We have the explicit memory tiers framework to manage systems with multiple types of memory, e.g., DRAM in DIMM slots and CXL memory devices. There, memory devices of the same kind are grouped into memory types, which are then put into memory tiers. To describe the performance of a memory type, abstract distance is defined, which is in direct proportion to the memory latency and inversely proportional to the memory bandwidth.
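Restating that definition as a formula (an editorial restatement, not from the commit; the combined read+write sums match what mt_perf_to_adistance() uses later in the series):

\mathrm{adist} \;\propto\; \frac{\mathrm{read\_latency} + \mathrm{write\_latency}}{\mathrm{read\_bandwidth} + \mathrm{write\_bandwidth}}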
To keep the code as simple as possible, a fixed abstract distance is used in dax/kmem to describe slow memory such as Optane DCPMM. To support more memory types, in this series, we added the abstract distance calculation algorithm management mechanism, provided an algorithm implementation based on ACPI HMAT, and used the general abstract distance calculation interface in the dax/kmem driver. So, dax/kmem can support HBM (high bandwidth memory) in addition to the original Optane DCPMM.

This patch (of 4):

The abstract distance may be calculated by various drivers, such as ACPI HMAT, CXL CDAT, etc., while it may be used by various code which hot-adds memory nodes, such as dax/kmem. To decouple the algorithm users and the providers, the abstract distance calculation algorithms management mechanism is implemented in this patch. It provides an interface for providers to register algorithm implementations and an interface for users. Multiple algorithm implementations can cooperate by calculating abstract distance for different memory nodes. The preference of algorithm implementations can be specified via priority (notifier_block.priority).

Link: https://lkml.kernel.org/r/20230926060628.265989-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230926060628.265989-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Alistair Popple Reviewed-by: Dave Jiang Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 876d8a5e210e..4301e7e89223 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "internal.h" @@ -105,6 +106,8 @@ static int top_tier_adistance; static struct demotion_nodes *node_demotion __read_mostly; #endif /* CONFIG_MIGRATION */ +static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); + static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); } @@ -592,6 +595,62 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +/** + * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm + * @nb: The notifier block which describes the algorithm + * + * Return: 0 on success, errno on error. + * + * Every memory tiering abstract distance algorithm provider needs to + * register the algorithm with register_mt_adistance_algorithm(). To + * calculate the abstract distance for a specified memory node, the + * notifier function will be called unless some higher priority + * algorithm has provided a result. The prototype of the notifier + * function is as follows, + * + * int (*algorithm_notifier)(struct notifier_block *nb, + * unsigned long nid, void *data); + * + * Where "nid" specifies the memory node, "data" is the pointer to the + * returned abstract distance (that is, "int *adist"). If the + * algorithm provides the result, NOTIFY_STOP should be returned. + * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next + * algorithm in the chain to provide the result.
+ */ +int register_mt_adistance_algorithm(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&mt_adistance_algorithms, nb); +} +EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm); + +/** + * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm + * @nb: the notifier block which describes the algorithm + * + * Return: 0 on success, errno on error. + */ +int unregister_mt_adistance_algorithm(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb); +} +EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm); + +/** + * mt_calc_adistance() - Calculate abstract distance with registered algorithms + * @node: the node to calculate abstract distance for + * @adist: the returned abstract distance + * + * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some + * abstract distance algorithm provides the result, and return it via + * @adist. Otherwise, no algorithm can provide the result and @adist + * will be kept as it is. + */ +int mt_calc_adistance(int node, int *adist) +{ + return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist); +} +EXPORT_SYMBOL_GPL(mt_calc_adistance); + static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { -- cgit From 3718c02dbd4c88d47b5af003acdb3d1112604ea3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:27 +0800 Subject: acpi, hmat: calculate abstract distance with HMAT

A memory tiering abstract distance calculation algorithm based on ACPI HMAT is implemented. The basic idea is as follows.

The performance attributes of the system default DRAM nodes are recorded as the baseline, whose abstract distance is MEMTIER_ADISTANCE_DRAM. Then, the ratio of the abstract distance of a memory node (target) to MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance attributes of the node to those of the default DRAM nodes.

The functions to record the read/write latency/bandwidth of the default DRAM nodes and to calculate abstract distance according to the read/write latency/bandwidth ratio will be used by CXL CDAT (Coherent Device Attribute Table) and other memory device drivers. So, they are put in memory-tiers.c.
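To make the provider-side API from the previous patch concrete, here is a hedged sketch of how an algorithm such as this HMAT one plugs into the notifier chain; the node id, priority and distance value are made-up illustrations:

#include <linux/init.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>

/* Hypothetical provider: answers only for one node it knows about. */
static int example_adist_notify(struct notifier_block *nb,
				unsigned long nid, void *data)
{
	int *adist = data;

	if (nid != 2)		/* not our node: let other algorithms try */
		return NOTIFY_OK;

	/* e.g. twice as "far" as DRAM: higher latency, lower bandwidth */
	*adist = MEMTIER_ADISTANCE_DRAM * 2;
	return NOTIFY_STOP;	/* result provided, stop the chain */
}

static struct notifier_block example_adist_nb = {
	.notifier_call = example_adist_notify,
	.priority = 100,	/* preferred over lower-priority algorithms */
};

static int __init example_adist_init(void)
{
	return register_mt_adistance_algorithm(&example_adist_nb);
}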
Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Dave Jiang Reviewed-by: Alistair Popple Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 4301e7e89223..085321c77123 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -37,7 +37,7 @@ struct node_memory_type_map { static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; -static struct memory_dev_type *default_dram_type; +struct memory_dev_type *default_dram_type; static struct bus_type memory_tier_subsys = { .name = "memory_tiering", @@ -108,6 +108,11 @@ static struct demotion_nodes *node_demotion __read_mostly; static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); +static bool default_dram_perf_error; +static struct node_hmem_attrs default_dram_perf; +static int default_dram_perf_ref_nid = NUMA_NO_NODE; +static const char *default_dram_perf_ref_source; + static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); @@ -595,6 +600,102 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix) +{ + pr_info( +"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", + prefix, attrs->read_latency, attrs->write_latency, + attrs->read_bandwidth, attrs->write_bandwidth); +} + +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, + const char *source) +{ + int rc = 0; + + mutex_lock(&memory_tier_lock); + if (default_dram_perf_error) { + rc = -EIO; + goto out; + } + + if (perf->read_latency + perf->write_latency == 0 || + perf->read_bandwidth + perf->write_bandwidth == 0) { + rc = -EINVAL; + goto out; + } + + if (default_dram_perf_ref_nid == NUMA_NO_NODE) { + default_dram_perf = *perf; + default_dram_perf_ref_nid = nid; + default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); + goto out; + } + + /* + * The performance of all default DRAM nodes is expected to be + * same (that is, the variation is less than 10%). And it + * will be used as base to calculate the abstract distance of + * other memory nodes. 
+ */ + if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 > + default_dram_perf.read_latency || + abs(perf->write_latency - default_dram_perf.write_latency) * 10 > + default_dram_perf.write_latency || + abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 > + default_dram_perf.read_bandwidth || + abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 > + default_dram_perf.write_bandwidth) { + pr_info( +"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n" +"DRAM node %d.\n", nid, default_dram_perf_ref_nid); + pr_info(" performance of reference DRAM node %d:\n", + default_dram_perf_ref_nid); + dump_hmem_attrs(&default_dram_perf, " "); + pr_info(" performance of DRAM node %d:\n", nid); + dump_hmem_attrs(perf, " "); + pr_info( +" disable default DRAM node performance based abstract distance algorithm.\n"); + default_dram_perf_error = true; + rc = -EINVAL; + } + +out: + mutex_unlock(&memory_tier_lock); + return rc; +} + +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +{ + if (default_dram_perf_error) + return -EIO; + + if (default_dram_perf_ref_nid == NUMA_NO_NODE) + return -ENOENT; + + if (perf->read_latency + perf->write_latency == 0 || + perf->read_bandwidth + perf->write_bandwidth == 0) + return -EINVAL; + + mutex_lock(&memory_tier_lock); + /* + * The abstract distance of a memory node is in direct proportion to + * its memory latency (read + write) and inversely proportional to its + * memory bandwidth (read + write). The abstract distance, memory + * latency, and memory bandwidth of the default DRAM nodes are used as + * the base. + */ + *adist = MEMTIER_ADISTANCE_DRAM * + (perf->read_latency + perf->write_latency) / + (default_dram_perf.read_latency + default_dram_perf.write_latency) * + (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / + (perf->read_bandwidth + perf->write_bandwidth); + mutex_unlock(&memory_tier_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(mt_perf_to_adistance); + /** * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm * @nb: The notifier block which describe the algorithm -- cgit From 6bc2cfdf82d56863b7cf5e86e37a662b2ae5d47e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:28 +0800 Subject: dax, kmem: calculate abstract distance with general interface Previously, a fixed abstract distance MEMTIER_DEFAULT_DAX_ADISTANCE was used for the slow memory type in the kmem driver. This limited the usage of the kmem driver; for example, it could not be used for HBM (high bandwidth memory). So, we now use the general abstract distance calculation mechanism in the kmem driver to get a more accurate abstract distance on systems with proper support. The original MEMTIER_DEFAULT_DAX_ADISTANCE is used as a fallback only. Now, multiple memory types may be managed by kmem. These memory types are put into the "kmem_memory_types" list and protected by kmem_memory_type_lock.
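As a rough sketch of the fallback pattern this commit describes (illustrative only, not the literal kmem driver code; the function name is made up):

/* Combine the general interface with the fixed DAX default. */
static int example_kmem_adistance(int numa_node)
{
	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;

	/*
	 * If a registered algorithm (e.g. the HMAT one from the previous
	 * patch) can compute an abstract distance for this node, it
	 * overwrites adist; otherwise the fixed default above remains
	 * as the fallback.
	 */
	mt_calc_adistance(numa_node, &adist);
	return adist;
}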
Link: https://lkml.kernel.org/r/20230926060628.265989-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Dave Jiang Reviewed-by: Alistair Popple Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 085321c77123..8d5291add2bc 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -586,13 +586,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); - if (node_memory_types[node].memtype == memtype) + if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* * If we umapped all the attached devices to this node, * clear the node memory type. */ if (!node_memory_types[node].map_count) { + memtype = node_memory_types[node].memtype; node_memory_types[node].memtype = NULL; put_memory_type(memtype); } -- cgit From 5e924ff54d088828794d9f1a4d5bf17808f7270e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:36 -0700 Subject: mm/ksm: add "smart" page scanning mode Patch series "Smart scanning mode for KSM", v3. This patch series adds "smart scanning" for KSM. What is smart scanning? ======================= KSM evaluates all the candidate pages for each scan. It does not use historic information from previous scans. This has the effect that candidate pages that couldn't be used for KSM de-duplication continue to be evaluated for each scan. The idea of "smart scanning" is to keep historic information. With the historic information we can temporarily skip the candidate page for one or several scans. Details: ======== "Smart scanning" keeps two small counters to store whether the page has been used for KSM. One counter stores how often we have already tried to use the page for KSM, and the other counter stores how often we have skipped it. How often we skip the candidate page depends on how often it has failed KSM de-duplication. The code skips a maximum of 8 times. During testing this has proven to be a good compromise for different workloads. New sysfs knob: =============== Smart scanning is not enabled by default. With /sys/kernel/mm/ksm/smart_scan smart scanning can be enabled. Monitoring: =========== To monitor how effective smart scanning is, a new sysfs knob has been introduced. /sys/kernel/mm/ksm/pages_skipped reports how many pages have been skipped by smart scanning. Results: ======== - Various workloads have shown a 20% - 25% reduction in page scans. For the instagram workload, for instance, the number of pages scanned has been reduced from over 20M pages per scan to less than 15M pages. - Fewer page scans also resulted in an overall higher de-duplication rate, as some shorter-lived pages could additionally be de-duplicated. - Scanning fewer pages made it possible to reduce the pages_to_scan parameter, and this resulted in a 25% reduction in terms of CPU. - The improvements have been observed for workloads that enable KSM with madvise as well as prctl. This patch (of 4): This change adds a "smart" page scanning mode for KSM. So far all the candidate pages are continuously scanned to find candidates for de-duplication. There is a considerable number of pages that cannot be de-duplicated. This is costly in terms of CPU.
By using smart scanning, considerable CPU savings can be achieved. This change takes the history of scanning pages into account and skips the page scanning of certain pages for a while if de-duplication for this page has not been successful in the past. To do this it introduces two new fields in the ksm_rmap_item structure: age and remaining_skips. age is the KSM age, and remaining_skips determines how often scanning of this page is skipped. The age field is incremented each time the page is scanned and cannot be de-duplicated; its value is capped at U8_MAX. How often a page is skipped depends on how often de-duplication has been tried so far, and the number of skips is currently limited to 8. This value has proven to be effective with different workloads. The feature is currently disabled by default and can be enabled with the new smart_scan knob. The feature has proven to be very effective: up to 25% of the page scans can be eliminated; the pages_to_scan rate can be reduced by 40 - 50% and a similar de-duplication rate can be maintained. [akpm@linux-foundation.org: make ksm_smart_scan default true, for testing] Link: https://lkml.kernel.org/r/20230926040939.516161-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230926040939.516161-2-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Cc: Stefan Roesch Signed-off-by: Andrew Morton --- mm/ksm.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 981af9c72e7a..8ad66b1c91b9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -56,6 +56,8 @@ #define DO_NUMA(x) do { } while (0) #endif +typedef u8 rmap_age_t; + /** * DOC: Overview * @@ -193,6 +195,8 @@ struct ksm_stable_node { * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node + * @age: number of scan iterations since creation + * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; @@ -205,6 +209,8 @@ struct ksm_rmap_item { struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ + rmap_age_t age; + rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -281,6 +287,10 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* Skip pages that couldn't be de-duplicated previously */ +/* Default to true at least temporarily, for testing */ +static bool ksm_smart_scan = true; + /* The number of zero pages which is placed by KSM */ unsigned long ksm_zero_pages; @@ -2305,6 +2315,73 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, return rmap_item; } +/* + * Calculate skip age for the ksm page age. The age determines how often + * de-duplicating has already been tried unsuccessfully. If the age is + * smaller, the scanning of this page is skipped for less scans. + * + * @age: rmap_item age of page + */ +static unsigned int skip_age(rmap_age_t age) +{ + if (age <= 3) + return 1; + if (age <= 5) + return 2; + if (age <= 8) + return 4; + + return 8; +} + +/* + * Determines if a page should be skipped for the current scan.
+ * + * @page: page to check + * @rmap_item: associated rmap_item of page + */ +static bool should_skip_rmap_item(struct page *page, + struct ksm_rmap_item *rmap_item) +{ + rmap_age_t age; + + if (!ksm_smart_scan) + return false; + + /* + * Never skip pages that are already KSM; pages cmp_and_merge_page() + * will essentially ignore them, but we still have to process them + * properly. + */ + if (PageKsm(page)) + return false; + + age = rmap_item->age; + if (age != U8_MAX) + rmap_item->age++; + + /* + * Smaller ages are not skipped, they need to get a chance to go + * through the different phases of the KSM merging. + */ + if (age < 3) + return false; + + /* + * Are we still allowed to skip? If not, then don't skip it + * and determine how much more often we are allowed to skip next. + */ + if (!rmap_item->remaining_skips) { + rmap_item->remaining_skips = skip_age(age); + return false; + } + + /* Skip this page */ + rmap_item->remaining_skips--; + remove_rmap_item_from_tree(rmap_item); + return true; +} + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; @@ -2409,6 +2486,10 @@ next_mm: if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; + + if (should_skip_rmap_item(*page, rmap_item)) + goto next_page; + ksm_scan.address += PAGE_SIZE; } else put_page(*page); @@ -3449,6 +3530,28 @@ static ssize_t full_scans_show(struct kobject *kobj, } KSM_ATTR_RO(full_scans); +static ssize_t smart_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_smart_scan); +} + +static ssize_t smart_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return -EINVAL; + + ksm_smart_scan = value; + return count; +} +KSM_ATTR(smart_scan); + static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, @@ -3469,6 +3572,7 @@ static struct attribute *ksm_attrs[] = { &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, + &smart_scan_attr.attr, NULL, }; -- cgit From e5a68991268906a6989f1bf273580c2218662ad6 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:37 -0700 Subject: mm/ksm: add pages_skipped metric This change adds the "pages skipped" metric. To be able to evaluate how successful smart page scanning is, the pages skipped metric can be compared to the pages scanned metric. The pages skipped metric is a cumulative counter. The counter is stored under /sys/kernel/mm/ksm/pages_skipped. 
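For intuition about how quickly this counter can grow, here is a small stand-alone user-space program (an illustration only, not kernel code) that mirrors the skip_age() schedule introduced by the previous patch; pages only start being skipped once their age reaches 3:

#include <stdio.h>

/* Mirrors skip_age() from the previous patch, for offline experimentation. */
static unsigned int skip_age(unsigned int age)
{
	if (age <= 3)
		return 1;
	if (age <= 5)
		return 2;
	if (age <= 8)
		return 4;
	return 8;
}

int main(void)
{
	/* Show how many scans get skipped after each failed attempt. */
	for (unsigned int age = 3; age <= 10; age++)
		printf("age %u -> skip the next %u scan(s)\n", age, skip_age(age));
	return 0;
}

A page that keeps failing de-duplication therefore settles into being skipped 8 scans for every scan attempted, which is where most of a large pages_skipped value typically comes from.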
Link: https://lkml.kernel.org/r/20230926040939.516161-3-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/ksm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 8ad66b1c91b9..7efcc68ccc6e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -294,6 +294,9 @@ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ unsigned long ksm_zero_pages; +/* The number of pages that have been skipped due to "smart scanning" */ +static unsigned long ksm_pages_skipped; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -2377,6 +2380,7 @@ static bool should_skip_rmap_item(struct page *page, } /* Skip this page */ + ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; @@ -3464,6 +3468,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t pages_skipped_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); +} +KSM_ATTR_RO(pages_skipped); + static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3561,6 +3572,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA -- cgit From 30a89adf872d2e46323840964c95dc0ae3bb5843 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 16 Oct 2023 19:55:49 -0700 Subject: hugetlb: check for hugetlb folio before vmemmap_restore In commit d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") checks were added to print a warning if hugetlb_vmemmap_restore was called on a non-hugetlb page. This was mostly due to ordering issues in the hugetlb page setup and teardown sequences. One place missed was the routine dissolve_free_huge_page. Naoya Horiguchi noted: "I saw that VM_WARN_ON_ONCE() in hugetlb_vmemmap_restore is triggered when memory_failure() is called on a free hugetlb page with vmemmap optimization disabled (the warning is not triggered if vmemmap optimization is enabled). I think that we need check folio_test_hugetlb() before dissolve_free_huge_page() calls hugetlb_vmemmap_restore_folio()." Perform the check as suggested by Naoya. Link: https://lkml.kernel.org/r/20231017032140.GA3680@monkey Fixes: d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") Signed-off-by: Mike Kravetz Suggested-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fb3a5232173..dfb4534834f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2322,17 +2322,23 @@ retry: * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure.
+ * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. */ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: -- cgit From ff841a06c844b0556b434d67cfc43f4fda56ae7b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 22 Sep 2023 17:57:39 +0000 Subject: mm: memcg: refactor page state unit helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: memcg: fix tracking of pending stats updates values", v2. While working on adjacent code [1], I realized that the values passed into memcg_rstat_updated() to keep track of the magnitude of pending updates are inconsistent. They are mostly in pages, but sometimes they can be in bytes or KBs. Fix that. Patch 1 reworks memcg_page_state_unit() so that we can reuse it in patch 2 to check and normalize the units of state updates. [1]https://lore.kernel.org/lkml/20230921081057.3440885-1-yosryahmed@google.com/ This patch (of 2): memcg_page_state_unit() is currently used to identify the unit of a memcg state item so that all stats in memory.stat are in bytes. However, it lies about the units of WORKINGSET_* stats. These stats actually represent pages, but we present them to userspace as a scalar number of events. In retrospect, maybe those stats should have been memcg "events" rather than memcg "state". In preparation for using memcg_page_state_unit() for other purposes that need to know the truthful units of different stat items, break it down into two helpers: - memcg_page_state_unit() returns the actual unit of the item. - memcg_page_state_output_unit() returns the unit used for output. Use the latter instead of the former in memcg_page_state_output() and lruvec_page_state_output(). While we are at it, let's show cgroup v1 some love and add memcg_page_state_local_output() for consistency. No functional change intended.
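As a concrete illustration of the split (readable off the two helpers in the diff below): NR_SLAB_RECLAIMABLE_B is truly in bytes, so both helpers return 1; NR_KERNEL_STACK_KB is truly in KBs, so both return SZ_1K; but WORKINGSET_REFAULT_ANON is truly in pages, so memcg_page_state_unit() now returns PAGE_SIZE while memcg_page_state_output_unit() keeps returning 1, preserving the scalar event count that userspace already sees in memory.stat.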
Link: https://lkml.kernel.org/r/20230922175741.635002-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230922175741.635002-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8713bc796c77..7e65d8c2e685 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1535,7 +1535,7 @@ static const struct memory_stat memory_stats[] = { { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, }; -/* Translate stat items to the correct unit for memory.stat output */ +/* The actual unit of the state item, not the same as the output unit */ static int memcg_page_state_unit(int item) { switch (item) { @@ -1543,6 +1543,22 @@ static int memcg_page_state_unit(int item) case MEMCG_ZSWAP_B: case NR_SLAB_RECLAIMABLE_B: case NR_SLAB_UNRECLAIMABLE_B: + return 1; + case NR_KERNEL_STACK_KB: + return SZ_1K; + default: + return PAGE_SIZE; + } +} + +/* Translate stat items to the correct unit for memory.stat output */ +static int memcg_page_state_output_unit(int item) +{ + /* + * Workingset state is actually in pages, but we export it to userspace + * as a scalar count of events, so special case it here. + */ + switch (item) { case WORKINGSET_REFAULT_ANON: case WORKINGSET_REFAULT_FILE: case WORKINGSET_ACTIVATE_ANON: @@ -1551,17 +1567,23 @@ static int memcg_page_state_unit(int item) case WORKINGSET_RESTORE_FILE: case WORKINGSET_NODERECLAIM: return 1; - case NR_KERNEL_STACK_KB: - return SZ_1K; default: - return PAGE_SIZE; + return memcg_page_state_unit(item); } } static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) { - return memcg_page_state(memcg, item) * memcg_page_state_unit(item); + return memcg_page_state(memcg, item) * + memcg_page_state_output_unit(item); +} + +static inline unsigned long memcg_page_state_local_output( + struct mem_cgroup *memcg, int item) +{ + return memcg_page_state_local(memcg, item) * + memcg_page_state_output_unit(item); } static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) @@ -4113,9 +4135,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], - nr * memcg_page_state_unit(memcg1_stats[i])); + nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); + seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4141,9 +4162,9 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - nr = memcg_page_state(memcg, memcg1_stats[i]); + nr = memcg_page_state_output(memcg, memcg1_stats[i]); seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * memcg_page_state_unit(memcg1_stats[i])); + (u64)nr); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -6625,7 +6646,8 @@ static int memory_stat_show(struct seq_file *m, void *v) static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, int item) { - return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); + return lruvec_page_state(lruvec, item) * + 
memcg_page_state_output_unit(item); } static int memory_numa_stat_show(struct seq_file *m, void *v) -- cgit From 7bd5bc3ce9632aefd0eed33a19212a2e55c0f873 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 22 Sep 2023 17:57:40 +0000 Subject: mm: memcg: normalize the value passed into memcg_rstat_updated() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memcg_rstat_updated() uses the value of the state update to keep track of the magnitude of pending updates, so that we only do a stats flush when it's worth the work. Most values passed into memcg_rstat_updated() are in pages, however, a few of them are actually in bytes or KBs. To put this into perspective, a 512 byte slab allocation today would look the same as allocating 512 pages. This may result in premature flushes, which means unnecessary work and latency. Normalize all the state values passed into memcg_rstat_updated() to pages. Round up non-zero sub-page updates to 1 page, because memcg_rstat_updated() ignores 0 page updates. Link: https://lkml.kernel.org/r/20230922175741.635002-3-yosryahmed@google.com Fixes: 5b3be698a872 ("memcg: better bounds on the memcg stats updates") Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e65d8c2e685..8539f2037168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -763,6 +763,22 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return x; } +static int memcg_page_state_unit(int item); + +/* + * Normalize the value passed into memcg_rstat_updated() to be in pages. Round + * up non-zero sub-page updates to 1 page as zero page updates are ignored. + */ +static int memcg_state_val_in_pages(int idx, int val) +{ + int unit = memcg_page_state_unit(idx); + + if (!val || unit == PAGE_SIZE) + return val; + else + return max(val * unit / PAGE_SIZE, 1UL); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -775,7 +791,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -826,7 +842,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); memcg_stats_unlock(); } -- cgit From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch() retrieves the addresses of the pages that are written to in a region of virtual memory. This syscall is used in Windows applications and games etc. This syscall is being emulated in a pretty slow manner in userspace.
Our purpose is to enhance the kernel so that it can be emulated efficiently. Currently some out-of-tree hack patches are being used to emulate it efficiently in some kernels. We intend to replace those with these patches, so the whole of gaming on Linux can effectively benefit from this. It means there would be tons of users of this code. CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, and only then can we start dumping pages. The information about pages becomes more and more outdated while we are processing pages. The new interface solves both these downsides. First, it allows us to read pte bits and clear the soft-dirty bit atomically. It means that CRIU will not need to freeze processes to pre-dump their memory. Second, it clears soft-dirty bits for a specified region of memory. It means CRIU will have actual info about pages to the moment of dumping them. * The new interface has to be much faster because basic page filtering is happening in the kernel. With the old interface, we have to read pagemap for each page. *Implementation Evolution (Short Summary)* From the definition of GetWriteWatch(), we felt that the kernel's soft-dirty feature could be used under the hood with some additions like: * reset the soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process * get and clear the soft-dirty flag for a specific region atomically So we decided to use an ioctl on the pagemap file to read and/or reset the soft-dirty flag. But using the soft-dirty flag, we sometimes got extra pages which weren't even written: they had become soft-dirty because of VMA merging and the VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were able to bypass this shortcoming by ignoring VM_SOFTDIRTY until David reported that mprotect etc. messes up the soft-dirty flag while ignoring VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We discussed whether we could revert those patches, but could not reach any conclusion. So at this point, I made a couple of attempts to solve this whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation: * [7] Correct the bug fixed wrongly back in 2014. It had the potential to cause a regression, so we left it behind. * [8] Keep a list of the soft-dirty parts of a VMA across splits and merges. The reply was: don't increase the size of the VMA by 8 bytes. At this point, we abandoned soft-dirty, considering it too delicate, and userfaultfd [9] seemed like the only way forward. From there onward, we have been basing soft-dirty emulation on the userfaultfd-wp feature, where the kernel resolves the faults itself when the WP_ASYNC feature is used. It was straightforward to add the WP_ASYNC feature to userfaultfd. Now only those pages which were really written get reported as dirty or written-to.
(PS: another userfaultfd feature, WP_UNPOPULATED, is required to avoid pre-faulting memory before write-protecting [9].) All the different masks were added at the request of CRIU devs to make the interface more generic and flexible. [1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch [2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com [3] https://github.com/google/sanitizers [4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit [5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com [6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/ [7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com [8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com [9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com [10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com This patch (of 6): Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, which allows userfaultfd wr-protect faults to be resolved by the kernel directly. It can be used like a high-accuracy version of soft-dirty, without vma modifications during tracking, and also with ranged support by default rather than for a whole mm when resetting the protections, thanks to the existence of ioctl(UFFDIO_WRITEPROTECT). Several goals of such a dirty tracking interface: 1. All types of memory should be supported and traceable. This is natural for soft-dirty but worth mentioning when the context is userfaultfd, because it used to support only anon/shmem/hugetlb. The problem is that for dirty tracking purposes these three types may not be enough, and it's legal to track anything, e.g. any page cache writes from mmap. 2. Protections can be applied to part of a memory range, without vma split/merge fuss. The hope is that the tracking itself should not affect any vma layout change. It also helps when a reset happens, because the reset will not need the mmap write lock, which can block the tracee. 3. Accuracy needs to be maintained. This means we need pte markers to work on any type of VMA. One could question that the whole concept of async dirty tracking is not really close to what userfaultfd fundamentally used to be: it's not "a fault to be serviced by userspace" anymore. However, using userfaultfd-wp here as a framework is convenient for us in at least the following ways: 1. The VM_UFFD_WP vma flag has a very good name to suit something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY; just use a new feature bit to distinguish it from a sync version of uffd-wp registration. 2. The PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as an arch supports it; this also applies to this case, where the uffd-wp bit will be a hint to dirty information and will not easily get lost (e.g. when some page cache ptes got zapped). 3. The ioctl(UFFDIO_WRITEPROTECT) interface can be reused for either starting or resetting a range of memory, while there's no counterpart in the old soft-dirty world; hence if this were wanted in a new design we would otherwise need a new interface. We can somehow understand that commonality because uffd-wp was fundamentally a similar idea of write-protecting pages, just like soft-dirty. This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC seems not to be usable without WP_UNPOPULATED.
This also gives us a chance to modify the implementation of WP_ASYNC in case it should no longer depend on WP_UNPOPULATED in future kernels. It's also fine to imply that, because both features will rely on the PTE_MARKER_UFFD_WP config option, so they'll show up together (or both be missing) in an UFFDIO_API probe. vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp. So we can track dirtying for all kinds of memory, including generic file systems (like XFS, EXT4 or BTRFS). One trick worth mentioning in do_wp_page() is that we need to manually update vmf->orig_pte here because it can be used later with a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags. The major defect of this approach to dirty tracking is that we need to populate the pgtables when tracking starts. Soft-dirty doesn't do it like that. It's unwanted in the case where the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet). One way to improve this is to allow pte markers to exist at levels larger than PTE (PMD+). That will not change the interface if implemented, so we can leave that for later. Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- mm/hugetlb.c | 32 +++++++++++++++++++------------- mm/memory.c | 28 +++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfb4534834f5..bc654b36df9f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; + if (!userfaultfd_wp_async(vma)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; - spin_unlock(ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); + spin_unlock(ptl); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); } - hugetlb_vma_unlock_read(vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, VM_UFFD_WP); + + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(mm, haddr, ptep, entry); + /* Fallthrough to CoW */ } /* diff --git a/mm/memory.c b/mm/memory.c index 2d209bc4d18c..d0f61d729fb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier:
GPL-2.0-only /* * linux/mm/memory.c @@ -3349,11 +3350,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; + pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (!userfaultfd_wp_async(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); + + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + /* + * Update this to be prepared for following up CoW + * handling + */ + vmf->orig_pte = pte; } /* @@ -4879,8 +4897,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) + goto split; return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); } @@ -4892,6 +4913,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } } +split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- cgit From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:14 +0500 Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally clear the info about page table entries. The following operations are supported in this IOCTL: - Scan the address range and get the memory ranges matching the provided criteria. This is performed when the output buffer is specified. - Write-protect the pages. PM_SCAN_WP_MATCHING is used to write-protect the pages of interest. PM_SCAN_CHECK_WPASYNC aborts the operation if non-Async Write Protected pages are found. PM_SCAN_WP_MATCHING can be used with or without PM_SCAN_CHECK_WPASYNC. - Both of those operations can be combined into one atomic operation where we can get and write protect the pages as well. The following flags about pages are currently supported: - PAGE_IS_WPALLOWED - Page has async-write-protection enabled - PAGE_IS_WRITTEN - Page has been written to from the time it was write protected - PAGE_IS_FILE - Page is file backed - PAGE_IS_PRESENT - Page is present in the memory - PAGE_IS_SWAPPED - Page is in swap - PAGE_IS_PFNZERO - Page has zero PFN - PAGE_IS_HUGE - Page is THP or Hugetlb backed This IOCTL can be extended to get information about more PTE bits. The entire address range passed by the user [start, end) is scanned until either the user-provided buffer is full or max_pages have been found.
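To make the calling convention concrete, here is a rough user-space sketch of a scan-only call. The struct and macro names come from the UAPI this series adds (see include/uapi/linux/fs.h in the full patch); the address range is a made-up assumption, error handling is elided, and for PAGE_IS_WRITTEN to report anything meaningful the range must already be tracked via userfaultfd WP_ASYNC as described in the previous patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* struct pm_scan_arg, PAGEMAP_SCAN, PAGE_IS_* */

int main(void)
{
	struct page_region regions[32];
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		.start = 0x7f0000000000ULL,	/* assumed example range */
		.end = 0x7f0000200000ULL,
		.vec = (unsigned long)regions,
		.vec_len = 32,
		.category_mask = PAGE_IS_WRITTEN,	/* match written pages */
		.return_mask = PAGE_IS_WRITTEN,
	};
	int fd = open("/proc/self/pagemap", O_RDONLY);
	/* On success the return value is the number of regions filled. */
	long n = ioctl(fd, PAGEMAP_SCAN, &arg);

	for (long i = 0; i < n; i++)
		printf("written: 0x%llx - 0x%llx\n",
		       (unsigned long long)regions[i].start,
		       (unsigned long long)regions[i].end);
	return 0;
}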
[akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"] [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning] [arnd@arndb.de: hide unused pagemap_scan_backout_range() function] Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org [sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"] Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Michał Mirosław Signed-off-by: Arnd Bergmann Signed-off-by: Stephen Rothwell Reviewed-by: Andrei Vagin Reviewed-by: Michał Mirosław Cc: Alex Sierra Cc: Al Viro Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc654b36df9f..2878e0e6bac5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte) return false; } -static bool is_hugetlb_entry_hwpoisoned(pte_t pte) +bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry); + set_huge_pte_at(mm, haddr, ptep, entry, + huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } -- cgit From 6facf36ee49675ea952713a7a1488e9f191e7eec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 1 Oct 2023 00:10:29 +0100 Subject: mm/filemap: clarify filemap_fault() comments for not uptodate case The existing comments in filemap_fault() suggest that, after either a minor fault has occurred and filemap_get_folio() found a folio in the page cache, or a major fault arose and __filemap_get_folio(FGP_CREATE...) did the job (having relied on do_sync_mmap_readahead() or filemap_read_folio() to read in the folio), the only possible reason it could not be uptodate is because of an error. This is not so, as if, for instance, the fault occurred within a VMA which had the VM_RAND_READ flag set (via madvise() with the MADV_RANDOM flag specified), this would cause even synchronous readahead to fail to read in the folio. I confirmed this by dropping page caches and faulting in memory madvise()'d this way, observing that this code path was reached on each occasion. Clarify the comments to include this case, and additionally update the comment recently added around the invalidate lock logic to make it clear the comment explicitly refers to the minor fault case. In addition, while we're here, refer to folios rather than pages. 
[lstoakes@gmail.com: correct indentation as per Christopher's feedback] Link: https://lkml.kernel.org/r/2c7014c0-6343-4e76-8697-3f84f54350bd@lucifer.local Link: https://lkml.kernel.org/r/20230930231029.88196-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 340c693112a9..062c25b184a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3307,21 +3307,28 @@ retry_find: VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* - * We have a locked page in the page cache, now we need to check - * that it's up-to-date. If not, it is going to be due to an error. + * We have a locked folio in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error, + * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* - * The page was in cache and uptodate and now it is not. - * Strange but possible since we didn't hold the page lock all - * the time. Let's drop everything get the invalidate lock and - * try again. + * If the invalidate lock is not held, the folio was in cache + * and uptodate and now it is not. Strange but possible since we + * didn't hold the page lock all the time. Let's drop + * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } + + /* + * OK, the folio is really not uptodate. This can be because the + * VMA has the VM_RAND_READ flag set, or because an error + * arose. Let's read it in directly. + */ goto page_not_uptodate; } -- cgit From e3e1a5067fd2f1b3f4f7c651f5b33082962d1aa1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:26:53 -0700 Subject: shmem: remove vma arg from shmem_get_folio_gfp() The vma is already there in vmf->vma, so no need for a separate arg. Link: https://lkml.kernel.org/r/d9ce6f65-a2ed-48f4-4299-fdb0544875c5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 8539d39ec9a3..dfa7e544a99d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1921,14 +1921,13 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vma, vmf, and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) + struct vm_fault *vmf, vm_fault_t *fault_type) { + struct vm_area_struct *vma = vmf ?
vmf->vma : NULL; struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; @@ -2141,7 +2140,7 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } /* @@ -2225,7 +2224,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, - gfp, vma, vmf, &ret); + gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) @@ -4893,7 +4892,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, - gfp, NULL, NULL, NULL); + gfp, NULL, NULL); if (error) return ERR_PTR(error); -- cgit From f0a9ad1d4d9ba3c694bca91d8d67be9a4a33b902 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:27:53 -0700 Subject: shmem: factor shmem_falloc_wait() out of shmem_fault() That Trinity livelock shmem_falloc avoidance block is unlikely, and a distraction from the proper business of shmem_fault(): separate it out. (This used to help compilers save stack on the fault path too, but both gcc and clang nowadays seem to make better choices anyway.) Link: https://lkml.kernel.org/r/6fe379a4-6176-9225-9263-fe60d2633c0@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 126 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 69 insertions(+), 57 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index dfa7e544a99d..0aa8889c76f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2148,87 +2148,99 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. 
+ */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; + } + return ret; +} + static vm_fault_t shmem_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; + vm_fault_t ret = 0; int err; - vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_rwsem. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_undo_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_rwsem in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (shmem_falloc && - shmem_falloc->waitq && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - struct file *fpin; - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); - - ret = VM_FAULT_NOPAGE; - fpin = maybe_unlock_mmap_for_io(vmf, NULL); - if (fpin) - ret = VM_FAULT_RETRY; - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the shmem_fallocate() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). 
- */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - - if (fpin) - fput(fpin); + ret = shmem_falloc_wait(vmf, inode); + if (ret) return ret; - } - spin_unlock(&inode->i_lock); } + WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); - if (folio) + if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } return ret; } -- cgit From 9be7d5b06648b808989e99c5d0bea1be47c5a384 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:28:50 -0700 Subject: shmem: trivial tidyups, removing extra blank lines, etc Mostly removing a few superfluous blank lines, joining short arglines, imposing some 80-column observance, correcting a couple of comments. None of it more interesting than deleting a repeated INIT_LIST_HEAD(). Link: https://lkml.kernel.org/r/b3983d28-5d3f-8649-36af-b819285d7a9e@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 56 +++++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 0aa8889c76f2..3540fc598848 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -756,7 +756,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Like filemap_add_folio, but error if expected item has gone. + * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, @@ -825,7 +825,7 @@ error: } /* - * Like delete_from_page_cache, but substitutes swap for @folio. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. 
*/ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { @@ -887,7 +887,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, cond_resched_rcu(); } } - rcu_read_unlock(); return swapped << PAGE_SHIFT; @@ -1213,7 +1212,6 @@ static int shmem_setattr(struct mnt_idmap *idmap, if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); - if (error) return error; } @@ -2456,7 +2454,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, if (err) return ERR_PTR(err); - inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); @@ -2481,11 +2478,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { @@ -2697,7 +2693,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); - if (ret) return ret; @@ -3229,8 +3224,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, + error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3259,14 +3253,11 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } - - error = security_inode_init_security(inode, dir, - NULL, + error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3303,7 +3294,8 @@ static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, /* * Link a file.. 
*/ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; @@ -3334,7 +3326,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ + dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: return ret; @@ -3354,7 +3346,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) inode_set_ctime_current(inode)); inode_inc_iversion(dir); drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + dput(dentry); /* Undo the count from "create" - does all the work */ return 0; } @@ -3464,7 +3456,6 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3518,8 +3509,7 @@ static void shmem_put_link(void *arg) folio_put(arg); } -static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; @@ -3593,8 +3583,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -3778,7 +3767,6 @@ static struct dentry *shmem_find_alias(struct inode *inode) return alias ?: d_find_any_alias(inode); } - static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -4362,8 +4350,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } #endif /* CONFIG_TMPFS_QUOTA */ - inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, - VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -4662,11 +4650,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, - shmem_huge == values[i] ? "%s[%s]" : "%s%s", - i ? " " : "", - shmem_format_huge(values[i])); + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? " " : "", shmem_format_huge(values[i])); } - len += sysfs_emit_at(buf, len, "\n"); return len; @@ -4763,8 +4749,9 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? 
inode : ERR_PTR(-ENOSPC); @@ -4774,8 +4761,8 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct supe /* common code */ -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags, unsigned int i_flags) { struct inode *inode; struct file *res; @@ -4794,7 +4781,6 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); -- cgit From 4199f51a7eb2054d68964efbd8d39c68053a8714 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:30:03 -0700 Subject: shmem: shmem_acct_blocks() and shmem_inode_acct_blocks() By historical accident, shmem_acct_block() and shmem_inode_acct_block() were never pluralized when the pages argument was added, despite their complements being shmem_unacct_blocks() and shmem_inode_unacct_blocks() all along. It has been an irritation: fix their naming at last. Link: https://lkml.kernel.org/r/9124094-e4ab-8be7-ef80-9a87bdc2e4fc@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 3540fc598848..fabce18ed9e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -189,10 +189,10 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags, long pages) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; @@ -207,13 +207,13 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static int shmem_inode_acct_block(struct inode *inode, long pages) +static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; - if (shmem_acct_block(info->flags, pages)) + if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ @@ -447,7 +447,7 @@ bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; - if (shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ @@ -1671,7 +1671,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, huge = false; nr = huge ? 
HPAGE_PMD_NR : 1; - err = shmem_inode_acct_block(inode, nr); + err = shmem_inode_acct_blocks(inode, nr); if (err) goto failed; @@ -2572,7 +2572,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to -- cgit From 054a9f7ccd0a60607fb9bbe1e06ca671494971bf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:31:27 -0700 Subject: shmem: move memcg charge out of shmem_add_to_page_cache() Extract shmem's memcg charging out of shmem_add_to_page_cache(): it's misleadingly done there, because many calls are dealing with a swapcache page, whose memcg is nowadays always remembered while swapped out, then the charge re-levied when it's brought back into swapcache. Temporarily move it back up to the shmem_get_folio_gfp() level, where the memcg was charged before v5.8; but the next commit goes on to move it back down to a new home. In making this change, it becomes clear that shmem_swapin_folio() does not need to know the vma, just the fault mm (if any): call it fault_mm rather than charge_mm - let mem_cgroup_charge() decide whom to charge. Link: https://lkml.kernel.org/r/4b2143c5-bf32-64f0-841-81a81158dac@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 68 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fabce18ed9e5..670f77de3238 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -146,9 +146,8 @@ static unsigned long shmem_default_max_inodes(void) #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - vm_fault_t *fault_type); + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -760,12 +759,10 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp, - struct mm_struct *charge_mm) + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); long nr = folio_nr_pages(folio); - int error; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -776,16 +773,7 @@ static int shmem_add_to_page_cache(struct folio *folio, folio->mapping = mapping; folio->index = index; - if (!folio_test_swapcache(folio)) { - error = mem_cgroup_charge(folio, charge_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto error; - } - } + gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); do { @@ -813,15 +801,12 @@ unlock: } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - error = xas_error(&xas); - goto error; + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); } return 0; -error: - folio->mapping = NULL; -
folio_ref_sub(folio, nr); - return error; } /* @@ -1324,10 +1309,8 @@ static int shmem_unuse_swap_entries(struct inode *inode, if (!xa_is_value(folio)) continue; - error = shmem_swapin_folio(inode, indices[i], - &folio, SGP_CACHE, - mapping_gfp_mask(mapping), - NULL, NULL); + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); @@ -1810,12 +1793,11 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct mm_struct *fault_mm, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; @@ -1843,7 +1825,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ folio = shmem_swapin(swap, gfp, info, index); @@ -1880,8 +1862,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } error = shmem_add_to_page_cache(folio, mapping, index, - swp_to_radix_entry(swap), gfp, - charge_mm); + swp_to_radix_entry(swap), gfp); if (error) goto failed; @@ -1929,7 +1910,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct mm_struct *charge_mm; + struct mm_struct *fault_mm; struct folio *folio; pgoff_t hindex; gfp_t huge_gfp; @@ -1946,7 +1927,7 @@ repeat: } sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : NULL; + fault_mm = vma ? 
vma->vm_mm : NULL; folio = filemap_get_entry(mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); @@ -1958,7 +1939,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, fault_mm, fault_type); if (error == -EEXIST) goto repeat; @@ -2044,9 +2025,16 @@ alloc_nohuge: if (sgp == SGP_WRITE) __folio_set_referenced(folio); - error = shmem_add_to_page_cache(folio, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK, - charge_mm); + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unacct; + } + + error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); if (error) goto unacct; @@ -2644,8 +2632,10 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; -- cgit From 3022fd7af9604d44ec43da8a4398872989599b18 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:32:40 -0700 Subject: shmem: _add_to_page_cache() before shmem_inode_acct_blocks() There has been a recurring problem: when a tmpfs volume is being filled by racing threads, some fail with ENOSPC (or consequent SIGBUS or EFAULT) even though all allocations were within the permitted size. This has been a problem since the early days, but it was magnified and complicated by the addition of huge pages. We have often worked around it by adding some slop to the tmpfs size, but it's hard to say how much is needed, and some users prefer not to do that: e.g. keeping sparse files in a tightly tailored tmpfs helps to prevent accidental writing to holes. This comes from the allocation sequence:

1. check page cache for existing folio
2. check and reserve from vm_enough_memory
3. check and account from size of tmpfs
4. if huge, check page cache for overlapping folio
5. allocate physical folio, huge or small
6. check and charge from mem cgroup limit
7. add to page cache (but maybe another folio already got in).

Concurrent tasks allocating at the same position could deplete the size allowance and fail. Doing vm_enough_memory and size checks before the folio allocation was intentional (to limit the load on the page allocator from this source) and still has some virtue; but memory cgroup never did that, so I think it's better reordered to favour predictable behaviour.

1. check page cache for existing folio
2. if huge, check page cache for overlapping folio
3. allocate physical folio, huge or small
4. check and charge from mem cgroup limit
5. add to page cache (but maybe another folio already got in)
6. check and reserve from vm_enough_memory
7. check and account from size of tmpfs.

The folio lock held from allocation onwards ensures that the !uptodate folio cannot be used by others, and can safely be deleted from the cache if checks 6 or 7 subsequently fail (and those waiting on folio lock already check that the folio was not truncated once they get the lock); and the early addition to page cache ensures that racers find it before they try to duplicate the accounting.
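To illustrate, a condensed sketch of the reordered path (simplified from the shmem_alloc_and_add_folio() introduced below; the huge-page conflict check and the shrink-then-retry on accounting failure are omitted):

	folio = shmem_alloc_folio(gfp, info, index);		/* 3. allocate */
	if (!folio)
		return ERR_PTR(-ENOMEM);
	__folio_set_locked(folio);		/* lock held until uptodate */
	error = mem_cgroup_charge(folio, fault_mm, gfp);	/* 4. memcg limit */
	if (error)
		goto unlock;
	error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); /* 5 */
	if (error)
		goto unlock;
	error = shmem_inode_acct_blocks(inode, pages);	/* 6. and 7. checked last */
	if (error) {
		filemap_remove_folio(folio);	/* safe: folio still locked */
		goto unlock;
	}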
Seize the opportunity to tidy up shmem_get_folio_gfp()'s ENOSPC retrying, which can be combined inside the new shmem_alloc_and_add_folio(): doing 2 splits twice (once huge, once nonhuge) is not exactly equivalent to trying 5 splits (and giving up early on huge), but let's keep it simple unless more complication proves necessary. Userfaultfd is a foreign country: they do things differently there, and for good reason - to avoid mmap_lock deadlock. Leave ordering in shmem_mfill_atomic_pte() untouched for now, but I would rather like to mesh it better with shmem_get_folio_gfp() in the future. Link: https://lkml.kernel.org/r/22ddd06-d919-33b-1219-56335c1bf28e@google.com Signed-off-by: Hugh Dickins Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 229 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 118 insertions(+), 111 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 670f77de3238..269cd3c1110f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -789,13 +789,11 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - } - mapping->nrpages += nr; __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -1612,25 +1610,17 @@ static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct address_space *mapping = info->vfs_inode.i_mapping; - pgoff_t hindex; struct folio *folio; - hindex = round_down(index, HPAGE_PMD_NR); - if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, - XA_PRESENT)) - return NULL; - - shmem_pseudo_vma_init(&pvma, info, hindex); + shmem_pseudo_vma_init(&pvma, info, index); folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); + return folio; } static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct folio *folio; @@ -1642,36 +1632,101 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) +static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, + struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, bool huge) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct folio *folio; - int nr; - int err; + long pages; + int error; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; - nr = huge ? HPAGE_PMD_NR : 1; - err = shmem_inode_acct_blocks(inode, nr); - if (err) - goto failed; + if (huge) { + pages = HPAGE_PMD_NR; + index = round_down(index, HPAGE_PMD_NR); + + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. 
+ * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (xa_find(&mapping->i_pages, &index, + index + HPAGE_PMD_NR - 1, XA_PRESENT)) + return ERR_PTR(-E2BIG); - if (huge) folio = shmem_alloc_hugefolio(gfp, info, index); - else + if (!folio) + count_vm_event(THP_FILE_FALLBACK); + } else { + pages = 1; folio = shmem_alloc_folio(gfp, info, index); - if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - return folio; + } + if (!folio) + return ERR_PTR(-ENOMEM); + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (huge) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unlock; } - err = -ENOMEM; - shmem_inode_unacct_blocks(inode, nr); -failed: - return ERR_PTR(err); + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. + */ + shmem_unused_huge_shrink(sbinfo, NULL, 2); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. + */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); + return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } /* @@ -1907,29 +1962,22 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo; struct mm_struct *fault_mm; struct folio *folio; - pgoff_t hindex; - gfp_t huge_gfp; int error; - int once = 0; - int alloced = 0; + bool alloced; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; - } - sbinfo = SHMEM_SB(inode->i_sb); + alloced = false; fault_mm = vma ? vma->vm_mm : NULL; - folio = filemap_get_entry(mapping, index); + folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); @@ -1951,7 +1999,7 @@ repeat: folio_lock(folio); /* Has the folio been truncated or swapped out? */ - if (unlikely(folio->mapping != mapping)) { + if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; @@ -1986,65 +2034,38 @@ repeat: return 0; } - if (!shmem_is_huge(inode, index, false, - vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) - goto alloc_nohuge; + if (shmem_is_huge(inode, index, false, fault_mm, + vma ? 
vma->vm_flags : 0)) { + gfp_t huge_gfp; - huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); - if (IS_ERR(folio)) { -alloc_nohuge: - folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(huge_gfp, + inode, index, fault_mm, true); + if (!IS_ERR(folio)) { + count_vm_event(THP_FILE_ALLOC); + goto alloced; + } + if (PTR_ERR(folio) == -EEXIST) + goto repeat; } - if (IS_ERR(folio)) { - int retry = 5; + folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + if (IS_ERR(folio)) { error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; folio = NULL; - if (error != -ENOSPC) - goto unlock; - /* - * Try to reclaim some space by splitting a large folio - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } goto unlock; } - hindex = round_down(index, folio_nr_pages(folio)); - - if (sgp == SGP_WRITE) - __folio_set_referenced(folio); - - error = mem_cgroup_charge(folio, fault_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } - - error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); - if (error) - goto unacct; - - folio_add_lru(folio); - shmem_recalc_inode(inode, folio_nr_pages(folio), 0); +alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio) - 1) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. @@ -2062,6 +2083,8 @@ alloc_nohuge: spin_unlock(&sbinfo->shrinklist_lock); } + if (sgp == SGP_WRITE) + folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ @@ -2085,11 +2108,6 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - if (alloced) { - folio_clear_dirty(folio); - filemap_remove_folio(folio); - shmem_recalc_inode(inode, 0, 0); - } error = -EINVAL; goto unlock; } @@ -2100,25 +2118,14 @@ out: /* * Error recovery. */ -unacct: - shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - - if (folio_test_large(folio)) { - folio_unlock(folio); - folio_put(folio); - goto alloc_nohuge; - } unlock: + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } - if (error == -ENOSPC && !once++) { - shmem_recalc_inode(inode, 0, 0); - goto repeat; - } - if (error == -EEXIST) - goto repeat; return error; } -- cgit From beb9868628445306958fd7b2da1cd369a4a381cc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:42:45 -0700 Subject: shmem,percpu_counter: add _limited_add(fbc, limit, amount) Percpu counter's compare and add are separate functions: without locking around them (which would defeat their purpose), it has been possible to overflow the intended limit. Imagine all the other CPUs fallocating tmpfs huge pages to the limit, in between this CPU's compare and its add. 
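The window shows up concretely in tmpfs's block accounting; a sketch of the pattern before and after (the names are the real shmem ones, with the control flow condensed from the hunk below):

	/* Before: compare then add, with a gap another CPU can use */
	if (percpu_counter_compare(&sbinfo->used_blocks,
				   sbinfo->max_blocks - pages) > 0)
		goto unacct;
	percpu_counter_add(&sbinfo->used_blocks, pages);

	/* After: one call that checks the limit and adds atomically */
	if (!percpu_counter_limited_add(&sbinfo->used_blocks,
					sbinfo->max_blocks, pages))
		goto unacct;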
I have not seen reports of that happening; but tmpfs's recent addition of dquot_alloc_block_nodirty() in between the compare and the add makes it even more likely, and I'd be uncomfortable to leave it unfixed. Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it. I believe this implementation is correct, and slightly more efficient than the combination of compare and add (taking the lock once rather than twice when nearing full - the last 128MiB of a tmpfs volume on a machine with 128 CPUs and 4KiB pages); but it does beg for a better design - when nearing full, there is no new batching, but the costly percpu counter sum across CPUs still has to be done, while locked. Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well as cpu_online_mask: but shouldn't __percpu_counter_compare() and __percpu_counter_limited_add() then be adding a num_dying_cpus() to num_online_cpus(), when they calculate the maximum which could be held across CPUs? But the times when it matters would be vanishingly rare. Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Tim Chen Cc: Dave Chinner Cc: Darrick J. Wong Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 269cd3c1110f..61b170324e5c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -217,15 +217,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages) might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); - if (err) + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; - - percpu_counter_add(&sbinfo->used_blocks, pages); + } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) -- cgit From 5d74b2ab2c15d596c470bae6626f345d5575a9d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:13 +0100 Subject: mm: make lock_folio_maybe_drop_mmap() VMA lock aware Patch series "Handle more faults under the VMA lock", v2. At this point, we're handling the majority of file-backed page faults under the VMA lock, using the ->map_pages entry point. This patch set attempts to expand that for the following situations:

- We have to do a read. This could be because we've hit the point in the readahead window where we need to kick off the next readahead, or because the page is simply not present in cache.
- We're handling a write fault. Most applications don't do I/O by writes to shared mmaps for very good reasons, but some do, and it'd be nice to not make that slow unnecessarily.
- We're doing a COW of a private mapping (both PTE already present and PTE not-present). These are two different codepaths and I handle both of them in this patch set.

There is no support in this patch set for drivers to mark themselves as being VMA lock friendly; they could implement the ->map_pages vm_operation, but if they do, they would be the first. This is probably something we want to change at some point in the future, and I've marked where to make that change in the code.
There is very little performance change in the benchmarks we've run, mostly because the vast majority of page faults are handled through the other paths. I still think this patch series is useful for workloads that may take these paths more often, and just for cleaning up the fault path in general (it's now clearer why we have to retry in these cases). This patch (of 6): Drop the VMA lock instead of the mmap_lock if that's the one which is held. Link: https://lkml.kernel.org/r/20231006195318.4087158-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231006195318.4087158-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 062c25b184a5..951709089f38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3090,7 +3090,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -3100,13 +3100,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else -- cgit From 164b06f238b986317131e6b61b2f22aabcbc2cc0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_lock. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return VM_FAULT_RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page().
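A minimal sketch of the resulting caller pattern in wp_page_copy() (condensed from the diff below):

	ret = vmf_anon_prepare(vmf);	/* 0, VM_FAULT_RETRY or VM_FAULT_OOM */
	if (unlikely(ret))
		goto out;	/* on RETRY the VMA lock was already released */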
Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d0f61d729fb4..c437d3562d18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3069,27 +3084,29 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!new_folio) goto oom; } else { + int err; new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); if (!new_folio) goto oom; - ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); - if (ret) { + err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3102,7 +3119,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(old_folio); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } @@ -3212,11 +3229,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: folio_put(new_folio); oom: + ret = VM_FAULT_OOM; +out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3458,12 +3477,6 @@ reuse: return 0; } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ -- cgit From 4ed4379881aa62588aba6442a9f362a8cf7624e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. 
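The open-coded bail-out in do_shared_fault() then reduces to a call to the new helper; a simplified sketch:

	ret = vmf_can_call_fault(vmf);	/* 0 when ->fault is safe to call */
	if (ret)
		return ret;	/* VM_FAULT_RETRY, VMA lock already dropped */

	ret = __do_fault(vmf);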
Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c437d3562d18..7b240a67f207 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) vm_fault_t ret, tmp; struct folio *folio; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit From 4de8c93a4751e10737b6af65db42c743228c67a6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7b240a67f207..be9469a29413 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4639,13 +4639,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) -- cgit From 12214eba1992642eee5813a9cc9f626e5b2d1815 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. 
Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index be9469a29413..ada461b82a75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4617,10 +4617,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit From 4a68fef16df9d88d528094116f8bbd2dbfa62089 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index ada461b82a75..ceffe96f2a0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3301,10 +3301,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3328,10 +3327,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { folio_put(folio); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf, folio); -- cgit From 5ca432896a4ce6d69fffc3298b24c0dd9bdb871f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:47 +0200 Subject: mm/rmap: move SetPageAnonExclusive() out of page_move_anon_rmap() Patch series "mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()". Convert page_move_anon_rmap() to folio_move_anon_rmap(), letting the callers handle PageAnonExclusive. I'm including cleanup patch #3 because it fits into the picture and can be done cleaner by the conversion. This patch (of 3): Let's move it into the caller: there is a difference between whether an anon folio can only be mapped by one process (e.g., into one VMA), and whether it is truly exclusive (e.g., no references -- including GUP -- from other processes). Further, for large folios the page might not actually be pointing at the head page of the folio, so it better be handled in the caller. This is a preparation for converting page_move_anon_rmap() to consume a folio. 
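After the move, each caller pairs the rmap change with the exclusivity marking explicitly; a sketch of the resulting pattern, condensed from the do_huge_pmd_wp_page() hunk below:

	if (folio_ref_count(folio) == 1) {	/* sole reference: reusable */
		page_move_anon_rmap(page, vma);
		SetPageAnonExclusive(page);	/* now the caller's job */
	}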
Link: https://lkml.kernel.org/r/20231002142949.235104-1-david@redhat.com Link: https://lkml.kernel.org/r/20231002142949.235104-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 + mm/hugetlb.c | 4 +++- mm/memory.c | 1 + mm/rmap.c | 1 - 4 files changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aa0224556132..76ead290f1c8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1377,6 +1377,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) pmd_t entry; page_move_anon_rmap(page, vma); + SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2878e0e6bac5..35d924f74972 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5652,8 +5652,10 @@ retry_avoidcopy: * owner and can reuse this page. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { - if (!PageAnonExclusive(&old_folio->page)) + if (!PageAnonExclusive(&old_folio->page)) { page_move_anon_rmap(&old_folio->page, vma); + SetPageAnonExclusive(&old_folio->page); + } if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); diff --git a/mm/memory.c b/mm/memory.c index ceffe96f2a0e..21fba1c9a6c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3481,6 +3481,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * sunglasses. Hit it. */ page_move_anon_rmap(vmf->page, vma); + SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/rmap.c b/mm/rmap.c index c6bb2339e35b..6f1ea1491118 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1152,7 +1152,6 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); - SetPageAnonExclusive(page); } /** -- cgit From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:48 +0200 Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap() Let's convert it to consume a folio. 
[akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 2 +- mm/rmap.c | 16 +++++++--------- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76ead290f1c8..51a66eb48938 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) if (folio_ref_count(folio) == 1) { pmd_t entry; - page_move_anon_rmap(page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 35d924f74972..7ad9d2159da4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5653,7 +5653,7 @@ retry_avoidcopy: */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { - page_move_anon_rmap(&old_folio->page, vma); + folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) diff --git a/mm/memory.c b/mm/memory.c index 21fba1c9a6c7..6c67828f934c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3480,7 +3480,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(vmf->page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: diff --git a/mm/rmap.c b/mm/rmap.c index 6f1ea1491118..7a27a2b41802 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1128,19 +1128,17 @@ int folio_total_mapcount(struct folio *folio) } /** - * page_move_anon_rmap - move a page to our anon_vma - * @page: the page to move to our anon_vma - * @vma: the vma the page belongs to + * folio_move_anon_rmap - move a folio to our anon_vma + * @folio: The folio to move to our anon_vma + * @vma: The vma the folio belongs to * - * When a page belongs exclusively to one process after a COW event, - * that page can be moved into the anon_vma that belongs to just that - * process, so the rmap code will not search the parent or sibling - * processes. + * When a folio belongs exclusively to one process after a COW event, + * that folio can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling processes. */ -void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) +void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); -- cgit From dec078cc2181fccf8b134406b86aaacc19f7163f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:49 +0200 Subject: memory: move exclusivity detection in do_wp_page() into wp_can_reuse_anon_folio() Let's clean up do_wp_page() a bit, removing two labels and making it easier to read. wp_can_reuse_anon_folio() now only operates on the whole folio. Move the SetPageAnonExclusive() out into do_wp_page(). No need to do this under page lock -- the page table lock is sufficient.
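With the helper factored out, the reuse decision in do_wp_page() reads as a single condition; a simplified sketch (the unshare handling is elided):

	if (folio && folio_test_anon(folio) &&
	    (PageAnonExclusive(vmf->page) ||
	     wp_can_reuse_anon_folio(folio, vma))) {
		if (!PageAnonExclusive(vmf->page))
			SetPageAnonExclusive(vmf->page); /* under the PT lock */
		wp_page_reuse(vmf);
		return 0;
	}
	/* otherwise fall through and copy */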
Link: https://lkml.kernel.org/r/20231002142949.235104-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 88 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 6c67828f934c..22784d3b886d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3355,6 +3355,44 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + /* + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing + * the swapcache if there is little hope that we can reuse. + * + * KSM doesn't necessarily raise the folio refcount. + */ + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) + return false; + if (!folio_test_lru(folio)) + /* + * We cannot easily detect+handle references from + * remote LRU caches or references to LRU folios. + */ + lru_add_drain(); + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) + return false; + if (!folio_trylock(folio)) + return false; + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); + return false; + } + /* + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing + * sunglasses. Hit it. + */ + folio_move_anon_rmap(folio, vma); + folio_unlock(folio); + return true; +} + /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) @@ -3441,49 +3479,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. + * + * If we encounter a page that is marked exclusive, we must reuse + * the page without further checks. */ - if (folio && folio_test_anon(folio)) { - /* - * If the page is exclusive to this process we must reuse the - * page without further checks. - */ - if (PageAnonExclusive(vmf->page)) - goto reuse; - - /* - * We have to verify under folio lock: these early checks are - * just an optimization to avoid locking the folio and freeing - * the swapcache if there is little hope that we can reuse. - * - * KSM doesn't necessarily raise the folio refcount. - */ - if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) - goto copy; - if (!folio_test_lru(folio)) - /* - * We cannot easily detect+handle references from - * remote LRU caches or references to LRU folios. - */ - lru_add_drain(); - if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) - goto copy; - if (!folio_trylock(folio)) - goto copy; - if (folio_test_swapcache(folio)) - folio_free_swap(folio); - if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { - folio_unlock(folio); - goto copy; - } - /* - * Ok, we've got the only folio reference from our mapping - * and the folio is locked, it's dark out, and we're wearing - * sunglasses. Hit it. 
- */ - folio_move_anon_rmap(folio, vma); - SetPageAnonExclusive(vmf->page); - folio_unlock(folio); -reuse: + if (folio && folio_test_anon(folio) && + (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { + if (!PageAnonExclusive(vmf->page)) + SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3491,7 +3494,6 @@ reuse: wp_page_reuse(vmf); return 0; } -copy: /* * Ok, we need to copy. Oh, well.. */ -- cgit From ec47e250628903d48d9ff490c6b532df889bcaa3 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 3 Oct 2023 10:48:57 -0400 Subject: mm/migrate: remove unused mm argument from do_move_pages_to_node This function does not actively use the mm_struct, so it can be removed. Link: https://lkml.kernel.org/r/20231003144857.752952-2-gregory.price@memverge.com Signed-off-by: Gregory Price Reviewed-by: Jonathan Cameron Cc: Arnd Bergmann Cc: Gregory Price Signed-off-by: Andrew Morton --- mm/migrate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 482f478ea3fe..9d9eee5322d1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2029,8 +2029,7 @@ static int store_status(int __user *status, int start, int value, int nr) return 0; } -static int do_move_pages_to_node(struct mm_struct *mm, - struct list_head *pagelist, int node) +static int do_move_pages_to_node(struct list_head *pagelist, int node) { int err; struct migration_target_control mtc = { @@ -2119,7 +2118,7 @@ out: return err; } -static int move_pages_and_store_status(struct mm_struct *mm, int node, +static int move_pages_and_store_status(int node, struct list_head *pagelist, int __user *status, int start, int i, unsigned long nr_pages) { @@ -2128,7 +2127,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node, if (list_empty(pagelist)) return 0; - err = do_move_pages_to_node(mm, pagelist, node); + err = do_move_pages_to_node(pagelist, node); if (err) { /* * Positive err means the number of failed @@ -2196,7 +2195,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = node; start = i; } else if (node != current_node) { - err = move_pages_and_store_status(mm, current_node, + err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) goto out; @@ -2231,7 +2230,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, if (err) goto out_flush; - err = move_pages_and_store_status(mm, current_node, &pagelist, + err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) { /* We have accounted for page i */ @@ -2243,7 +2242,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, } out_flush: /* Make sure we do not overwrite the existing error */ - err1 = move_pages_and_store_status(mm, current_node, &pagelist, + err1 = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err >= 0) err = err1; -- cgit From 8c2214fc9a470aee0c615aeb14d8c7ce98e45a08 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Tue, 3 Oct 2023 20:41:55 +0900 Subject: mm: multi-gen LRU: reuse some legacy trace events Like the legacy lru, the mglru needs some trace events for debugging. Let's reuse the following legacy events for the mglru.
trace_mm_vmscan_lru_isolate
trace_mm_vmscan_lru_shrink_inactive

Here's an example:

mm_vmscan_lru_isolate: classzone=2 order=0 nr_requested=4096 nr_scanned=64 nr_skipped=0 nr_taken=64 lru=inactive_file
mm_vmscan_lru_shrink_inactive: nid=0 nr_scanned=64 nr_reclaimed=63 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=1 nr_ref_keep=0 nr_unmap_fail=0 priority=2 flags=RECLAIM_WB_FILE|RECLAIM_WB_ASYNC

Link: https://lkml.kernel.org/r/20231003114155.21869-1-jaewon31.kim@samsung.com Signed-off-by: Jaewon Kim Acked-by: Yu Zhao Cc: Johannes Weiner Cc: Kalesh Singh Cc: SeongJae Park Cc: Steven Rostedt (Google) Cc: T.J. Mercier Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 55424215f759..506f8220c5fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4304,6 +4304,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int sorted = 0; int scanned = 0; int isolated = 0; + int skipped = 0; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -4317,7 +4318,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); - int skipped = 0; + int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; @@ -4339,16 +4340,17 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, isolated += delta; } else { list_move(&folio->lru, &moved); - skipped += delta; + skipped_zone += delta; } - if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } - if (skipped) { + if (skipped_zone) { list_splice(&moved, head); - __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); + skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) @@ -4363,6 +4365,9 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_memcg_events(memcg, item, isolated); __count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, + scanned, skipped, isolated, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); /* * There might not be eligible folios due to reclaim_idx. Check the @@ -4493,6 +4498,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); sc->nr_reclaimed += reclaimed; + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + scanned, reclaimed, &stat, sc->priority, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { if (!folio_evictable(folio)) { -- cgit From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:51 +0100 Subject: mm: make __access_remote_vm() static Patch series "various improvements to the GUP interface", v2.
A series of fixes to simplify and improve the GUP interface with an eye to providing groundwork for future improvements:

* __access_remote_vm() and access_remote_vm() are functionally identical, so make the former static such that in future we can potentially change the external-facing implementation details of this function.
* Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and simplify things by defining INTERNAL_GUP_FLAGS to check against.
* Adjust __get_user_pages_locked() to explicitly treat a failure to pin any pages as an error in all circumstances other than FOLL_NOWAIT being specified, bringing it in line with the nommu implementation of this function.
* (With many thanks to Arnd who suggested this in the first instance) Update get_user_page_vma_remote() to explicitly only return a page or an error, simplifying the interface and avoiding the questionable IS_ERR_OR_NULL() pattern.

This patch (of 4): access_remote_vm() passes through parameters to __access_remote_vm() directly, so remove the __access_remote_vm() function from mm.h and use access_remote_vm() in the one caller that needs it (ptrace_access_vm()). This allows future adjustments to the GUP-internal __access_remote_vm() function while keeping the access_remote_vm() function stable. Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- mm/nommu.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 22784d3b886d..e47c36c0aef0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys); /* * Access another process' address space as given in mm. */ -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; diff --git a/mm/nommu.c b/mm/nommu.c index 7f9e9e5a0e12..f9553579389b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1651,8 +1651,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; -- cgit From 0f20bba1688bdf3b32df0162511a67d4eda15790 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:52 +0100 Subject: mm/gup: explicitly define and check internal GUP flags, disallow FOLL_TOUCH Rather than open-coding a list of internal GUP flags in is_valid_gup_args(), define which ones are internal. In addition, explicitly check to see if the user passed in FOLL_TOUCH somehow, as this appears to have been accidentally excluded.
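To see the effect, consider a hypothetical external caller that leaks an internal flag (illustrative only, not code from the patch):

	/*
	 * FOLL_TOUCH is internal-only after this patch: the call below
	 * now trips the WARN_ON_ONCE() in is_valid_gup_args() and fails
	 * with -EINVAL instead of being silently accepted.
	 */
	pinned = pin_user_pages_fast(addr, 1, FOLL_WRITE | FOLL_TOUCH, pages);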
Link: https://lkml.kernel.org/r/971e013dfe20915612ea8b704e801d7aef9a66b6.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 5 ++--- mm/internal.h | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 2f8a2d89fde1..b21b33d1787e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2227,12 +2227,11 @@ static bool is_valid_gup_args(struct page **pages, int *locked, /* * These flags not allowed to be specified externally to the gup * interfaces: - * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only + * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only and used on follow_page() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ - if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | - FOLL_REMOTE | FOLL_FAST_ONLY))) + if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) return false; gup_flags |= to_set; diff --git a/mm/internal.h b/mm/internal.h index 5a5c923725d3..2b79da9deb64 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1017,6 +1017,9 @@ enum { FOLL_UNLOCKABLE = 1 << 21, }; +#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ + FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the -- cgit From 9c4b21422507035f3e0a507a680c9b03c0bcc730 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:53 +0100 Subject: mm/gup: make failure to pin an error if FOLL_NOWAIT not specified There really should be no circumstances under which a non-FOLL_NOWAIT GUP operation fails to return any pages, so make this an error and warn on it. To catch the trivial case, simply exit early if nr_pages == 0. This brings __get_user_pages_locked() in line with the behaviour of its nommu variant. Link: https://lkml.kernel.org/r/2a42d96dd1e37163f90a0019a541163dafb7e4c3.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index b21b33d1787e..231711efa390 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1471,6 +1471,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, long ret, pages_done; bool must_unlock = false; + if (!nr_pages) + return 0; + /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. @@ -1595,6 +1598,14 @@ retry: mmap_read_unlock(mm); *locked = 0; } + + /* + * Failing to pin anything implies something has gone wrong (except when + * FOLL_NOWAIT is specified). 
+ */ + if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT))) + return -EFAULT; + return pages_done; } -- cgit From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:54 +0100 Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL get_user_pages_remote() will never return 0 except in the case of FOLL_NOWAIT being specified, which we explicitly disallow. This simplifies error handling for the caller and avoids the awkwardness of dealing with both errors and failing to pin. Failing to pin here is an error. Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Acked-by: Catalin Marinas Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e47c36c0aef0..1f88e4f6fbf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { + if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { @@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, continue; } - /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. -- cgit From d7196d87a1559db9ece24852e6167061b199d58d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:43 +0200 Subject: kasan: unify printk prefixes Unify prefixes for printk messages in mm/kasan/. Link: https://lkml.kernel.org/r/35589629806cf0840e5f01ec9d8011a7bad648df.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 2 +- mm/kasan/kasan_test_module.c | 2 +- mm/kasan/quarantine.c | 4 +++- mm/kasan/report_generic.c | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index b61cc6a42541..c707d6c6e019 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin */ -#define pr_fmt(fmt) "kasan_test: " fmt +#define pr_fmt(fmt) "kasan: test: " fmt #include #include diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 7be7bed456ef..8b7b3ea2c74e 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin */ -#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ +#define pr_fmt(fmt) "kasan: test: " fmt #include #include diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 152dca73f398..ca4529156735 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -8,6 +8,8 @@ * Based on code by Dmitry Chernenkov. 
*/ +#define pr_fmt(fmt) "kasan: " fmt + #include #include #include @@ -414,7 +416,7 @@ static int __init kasan_cpu_quarantine_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online", kasan_cpu_online, kasan_cpu_offline); if (ret < 0) - pr_err("kasan cpu quarantine register failed [%d]\n", ret); + pr_err("cpu quarantine register failed [%d]\n", ret); return ret; } late_initcall(kasan_cpu_quarantine_init); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 51a1e8a8877f..99cbcd73cff7 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -220,7 +220,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, const size_t tok_len = sep - *frame_descr; if (tok_len + 1 > max_tok_len) { - pr_err("KASAN internal error: frame description too long: %s\n", + pr_err("internal error: frame description too long: %s\n", *frame_descr); return false; } @@ -233,7 +233,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, *frame_descr = sep + 1; if (value != NULL && kstrtoul(token, 10, value)) { - pr_err("KASAN internal error: not a valid number: %s\n", token); + pr_err("internal error: not a valid number: %s\n", token); return false; } @@ -323,7 +323,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr, frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE); if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { - pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + pr_err("internal error: frame has invalid marker: %lu\n", frame[0]); return false; } -- cgit From 01a5ad81637672940844052404678678a0ec8854 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:44 +0200 Subject: kasan: use unchecked __memset internally KASAN code is supposed to use the unchecked __memset implementation when accessing its metadata. Change uses of memset to __memset in mm/kasan/. 
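For context, with KASAN enabled the ordinary memset() is interposed so that callers have their accesses validated against the shadow, roughly as in the sketch below (simplified; the exact check helper varies by kernel version). KASAN metadata is deliberately poisoned from the checker's point of view, so internal code must go straight to the unchecked __memset():

	#undef memset
	void *memset(void *addr, int c, size_t len)
	{
		/* Validate the access against the shadow memory first... */
		if (!kasan_check_range(addr, len, true, _RET_IP_))
			return NULL;
		/* ...then run the real, unchecked implementation. */
		return __memset(addr, c, len);
	}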
Link: https://lkml.kernel.org/r/6f621966c6f52241b5aaa7220c348be90c075371.1696605143.git.andreyknvl@google.com Fixes: 59e6e098d1c1 ("kasan: introduce kasan_complete_mode_report_info") Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/kasan/report.c | 4 ++-- mm/kasan/shadow.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6e3cb118d20e..e77facb62900 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -538,7 +538,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); - memset(&info, 0, sizeof(info)); + __memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; info.access_size = 0; @@ -576,7 +576,7 @@ bool kasan_report(const void *addr, size_t size, bool is_write, start_report(&irq_flags, true); - memset(&info, 0, sizeof(info)); + __memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = addr; info.access_size = size; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index dd772f9d0f08..d687f09a7ae3 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -324,7 +324,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (!page) return -ENOMEM; - memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); -- cgit From ff093a9632d9de9ff66dfd9db211008138520e13 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:45 +0200 Subject: kasan: fix and update KUNIT_EXPECT_KASAN_FAIL comment Update the comment for KUNIT_EXPECT_KASAN_FAIL to describe the parameters this macro accepts. Also drop the mention of the "kasan_status" KUnit resource, as it no longer exists. Link: https://lkml.kernel.org/r/6fad6661e72c407450ae4b385c71bc4a7e1579cd.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308171757.7V5YUcje-lkp@intel.com/ Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index c707d6c6e019..2030c7ff7de9 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -91,10 +91,11 @@ static void kasan_test_exit(struct kunit *test) } /** - * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a - * KASAN report; causes a test failure otherwise. This relies on a KUnit - * resource named "kasan_status". Do not use this name for KUnit resources - * outside of KASAN tests. + * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a + * KASAN report; causes a KUnit test failure otherwise. + * + * @test: Currently executing KUnit test. + * @expression: Expression that must produce a KASAN report. * * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. When this happens, this test handler reenables -- cgit From 9a12d103f7d2d524f5e3d4358b9a9c03e4a9f1d2 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 29 Sep 2023 14:30:41 -0400 Subject: mmap: add clarifying comment to vma_merge() code When tracing through the code in vma_merge(), it was not completely clear why the error return to a dup_anon_vma() call would not overwrite a previous attempt to the same function. This commit adds a comment specifying why it is safe. Link: https://lkml.kernel.org/r/20230929183041.2835469-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Jann Horn Link: https://lore.kernel.org/linux-mm/CAG48ez3iDwFPR=Ed1BfrNuyUJPMK_=StjxhUsCkL6po1s7bONg@mail.gmail.com/ Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 011ba48d9e72..673429ee8a9e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -943,6 +943,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_start_write(curr); remove = curr; remove2 = next; + /* + * Note that the dup_anon_vma below cannot overwrite err + * since the first caller would do nothing unless next + * has an anon_vma. + */ if (!next->anon_vma) err = dup_anon_vma(prev, curr, &anon_dup); } -- cgit From 27e0db3c21aaf1422980e64b77956e15b839306f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 27 Sep 2023 18:35:13 +0800 Subject: mm/page_alloc: remove unnecessary check in break_down_buddy_pages Patch series "Two minor cleanups to break_down_buddy_pages", v2. Two minor cleanups to break_down_buddy_pages. This patch (of 2): 1. We always have target in range started with next_page and full free range started with current_buddy. 2. The last split range size is 1 << low and low should be >= 0, then size >= 1. So page + size != page is always true (because size > 0). As summary, current_page will not equal to target page. Link: https://lkml.kernel.org/r/20230927103514.98281-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230927103514.98281-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 85741403948f..fdb68b7c8240 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6480,10 +6480,8 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, if (set_page_guard(zone, current_buddy, high, migratetype)) continue; - if (current_buddy != target) { - add_to_free_list(current_buddy, zone, high, migratetype); - set_buddy_order(current_buddy, high); - } + add_to_free_list(current_buddy, zone, high, migratetype); + set_buddy_order(current_buddy, high); } } -- cgit From 0dfca313a009c83e2ad44b3719dc1222df6c6db5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 27 Sep 2023 18:35:14 +0800 Subject: mm/page_alloc: remove unnecessary next_page in break_down_buddy_pages The next_page is only used to forward page in case target is in second half range. Move forward page directly to remove unnecessary next_page. 
Link: https://lkml.kernel.org/r/20230927103514.98281-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fdb68b7c8240..f46e519618a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6462,20 +6462,18 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, int migratetype) { unsigned long size = 1 << high; - struct page *current_buddy, *next_page; + struct page *current_buddy; while (high > low) { high--; size >>= 1; if (target >= &page[size]) { - next_page = page + size; current_buddy = page; + page = page + size; } else { - next_page = page; current_buddy = page + size; } - page = next_page; if (set_page_guard(zone, current_buddy, high, migratetype)) continue; -- cgit From bafd7e9d353ea091958d0dc702390add34b480e9 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 6 Oct 2023 13:01:20 +0200 Subject: filemap: call filemap_get_folios_tag() from filemap_get_folios() filemap_get_folios() is filemap_get_folios_tag() with XA_PRESENT as the tag that is being matched. Return filemap_get_folios_tag() with XA_PRESENT as the tag instead of duplicating the code in filemap_get_folios(). No functional changes. Link: https://lkml.kernel.org/r/20231006110120.136809-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 45 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 951709089f38..cd872fbc6086 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2120,48 +2120,13 @@ put: * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * - * The first folio may start before @start; if it does, it will contain - * @start. The final folio may extend beyond @end; if it does, it will - * contain @end. The folios have ascending indices. There may be gaps - * between the folios if there are indices which have no folio in the - * page cache. If folios are added to or removed from the page cache - * while this is running, they may or may not be found by this call. - * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, *start); - struct folio *folio; - - rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { - /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(folio)) - continue; - if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; - goto out; - } - } - - /* - * We come here when there is no page beyond @end. We take care to not - * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is a page at index -1 but that is - * already broken anyway. 
- */ - if (end == (pgoff_t)-1) - *start = (pgoff_t)-1; - else - *start = end + 1; -out: - rcu_read_unlock(); - - return folio_batch_count(fbatch); + return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); @@ -2240,7 +2205,13 @@ EXPORT_SYMBOL(filemap_get_folios_contig); * @tag: The tag index * @fbatch: The batch to fill * - * Same as filemap_get_folios(), but only returning folios tagged with @tag. + * The first folio may start before @start; if it does, it will contain + * @start. The final folio may extend beyond @end; if it does, it will + * contain @end. The folios have ascending indices. There may be gaps + * between the folios if there are indices which have no folio in the + * page cache. If folios are added to or removed from the page cache + * while this is running, they may or may not be found by this call. + * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. -- cgit From afb2d666d0250e707eef3c5947165b414268244b Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 6 Oct 2023 14:02:40 +0800 Subject: zsmalloc: use copy_page for full page copy Some architectures have implemented optimized copy_page for full page copying, such as arm. On my arm platform, use the copy_page helper for single page copying is about 10 percent faster than memcpy. Link: https://lkml.kernel.org/r/20231006060245.7411-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Reviewed-by: Sergey Senozhatsky Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: Minchan Kim Cc: YJ Chiang Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c743ce7a5f49..b1c0dad7f4cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1839,7 +1839,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * Here, any user cannot access all objects in the zspage so let's move. */ d_addr = kmap_atomic(newpage); - memcpy(d_addr, s_addr, PAGE_SIZE); + copy_page(d_addr, s_addr); kunmap_atomic(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; -- cgit From 0b237047d5a72ffe06c0bdf2f4536f669dcd31c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:03 +0100 Subject: mm: add folio_end_read() Provide a function for filesystems to call when they have finished reading an entire folio. Link: https://lkml.kernel.org/r/20231004165317.1061855-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index cd872fbc6086..1ce78c619294 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1525,6 +1525,28 @@ void folio_unlock(struct folio *folio) } EXPORT_SYMBOL(folio_unlock); +/** + * folio_end_read - End read on a folio. + * @folio: The folio. + * @success: True if all reads completed successfully. 
+ * + * When all reads against a folio have completed, filesystems should + * call this function to let the pagecache know that no more reads + * are outstanding. This will unlock the folio and wake up any thread + * sleeping on the lock. The folio will also be marked uptodate if all + * reads succeeded. + * + * Context: May be called from interrupt or process context. May not be + * called from NMI context. + */ +void folio_end_read(struct folio *folio, bool success) +{ + if (likely(success)) + folio_mark_uptodate(folio); + folio_unlock(folio); +} +EXPORT_SYMBOL(folio_end_read); + /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. -- cgit From 247dbcdbf790c52fc76cf8e327cd0a5778e41e66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:07 +0100 Subject: bitops: add xor_unlock_is_negative_byte() Replace clear_bit_and_unlock_is_negative_byte() with xor_unlock_is_negative_byte(). We have a few places that like to lock a folio, set a flag and unlock it again. Allow for the possibility of combining the latter two operations for efficiency. We are guaranteed that the caller holds the lock, so it is safe to unlock it with the xor. The caller must guarantee that nobody else will set the flag without holding the lock; it is not safe to do this with the PG_dirty flag, for example. Link: https://lkml.kernel.org/r/20231004165317.1061855-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 5 +++++ mm/kasan/kasan_test.c | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 1ce78c619294..c637863f4643 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,6 +1482,11 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); +#ifdef xor_unlock_is_negative_byte +#define clear_bit_unlock_is_negative_byte(nr, p) \ + xor_unlock_is_negative_byte(1 << nr, p) +#endif + #ifndef clear_bit_unlock_is_negative_byte /* diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 2030c7ff7de9..821c9ea0473a 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1099,9 +1099,10 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); -#if defined(clear_bit_unlock_is_negative_byte) - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = - clear_bit_unlock_is_negative_byte(nr, addr)); +#if defined(xor_unlock_is_negative_byte) + if (nr < 7) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + xor_unlock_is_negative_byte(1 << nr, addr)); #endif } -- cgit From f12fb73b74fd23ca33e3f95fb996f295eeae1da7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:14 +0100 Subject: mm: delete checks for xor_unlock_is_negative_byte() Architectures which don't define their own use the one in asm-generic/bitops/lock.h. Get rid of all the ifdefs around "maybe we don't have it". 
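For reference, the asm-generic fallback is roughly the following release-ordered fetch-xor (a sketch from memory; the exact atomic helper name may differ):

	static inline bool xor_unlock_is_negative_byte(unsigned long mask,
			volatile unsigned long *p)
	{
		long old = raw_atomic_long_fetch_xor_release(mask,
							     (atomic_long_t *)p);

		/* Bit 7 of the low byte is PG_waiters. */
		return !!(old & BIT(7));
	}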
Link: https://lkml.kernel.org/r/20231004165317.1061855-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 30 +----------------------------- mm/kasan/kasan_test.c | 3 --- 2 files changed, 1 insertion(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c637863f4643..458377e9a184 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,34 +1482,6 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); -#ifdef xor_unlock_is_negative_byte -#define clear_bit_unlock_is_negative_byte(nr, p) \ - xor_unlock_is_negative_byte(1 << nr, p) -#endif - -#ifndef clear_bit_unlock_is_negative_byte - -/* - * PG_waiters is the high bit in the same byte as PG_lock. - * - * On x86 (and on many other architectures), we can clear PG_lock and - * test the sign bit at the same time. But if the architecture does - * not support that special operation, we just do this all by hand - * instead. - * - * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unnecessary since it is - * in the same byte as PG_locked. - */ -static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) -{ - clear_bit_unlock(nr, mem); - /* smp_mb__after_atomic(); */ - return test_bit(PG_waiters, mem); -} - -#endif - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. @@ -1525,7 +1497,7 @@ void folio_unlock(struct folio *folio) BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) + if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0))) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 821c9ea0473a..8281eb42464b 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1098,12 +1098,9 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); - -#if defined(xor_unlock_is_negative_byte) if (nr < 7) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = xor_unlock_is_negative_byte(1 << nr, addr)); -#endif } static void kasan_bitops_generic(struct kunit *test) -- cgit From 0410cd844ed0af3db3cb510d877d62c66d26e5cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:15 +0100 Subject: mm: add folio_xor_flags_has_waiters() Optimise folio_end_read() by setting the uptodate bit at the same time we clear the unlock bit. This saves at least one memory barrier and one write-after-write hazard. 
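The new helper lives outside mm/ and is therefore not visible in the diff below; it is presumably a thin wrapper along these lines (sketch):

	static inline bool folio_xor_flags_has_waiters(struct folio *folio,
			unsigned long mask)
	{
		/* Toggle @mask in folio->flags, report whether PG_waiters was set. */
		return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0));
	}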
Link: https://lkml.kernel.org/r/20231004165317.1061855-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 458377e9a184..e9c636f57777 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1497,7 +1497,7 @@ void folio_unlock(struct folio *folio) BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0))) + if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); @@ -1518,9 +1518,17 @@ EXPORT_SYMBOL(folio_unlock); */ void folio_end_read(struct folio *folio, bool success) { + unsigned long mask = 1 << PG_locked; + + /* Must be in bottom byte for x86 to work */ + BUILD_BUG_ON(PG_uptodate > 7); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + if (likely(success)) - folio_mark_uptodate(folio); - folio_unlock(folio); + mask |= 1 << PG_uptodate; + if (folio_xor_flags_has_waiters(folio, mask)) + folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); -- cgit From 7d0795d098a127508f3e29ab4257c9ab598efaea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:16 +0100 Subject: mm: make __end_folio_writeback() return void Rather than check the result of test-and-clear, just check that we have the writeback bit set at the start. This wouldn't catch every case, but it's good enough (and enables the next patch). Link: https://lkml.kernel.org/r/20231004165317.1061855-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++++++-- mm/internal.h | 2 +- mm/page-writeback.c | 38 ++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index e9c636f57777..523d9e15b3b0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1593,9 +1593,15 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing @@ -1615,8 +1621,7 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake(). 
*/ folio_get(folio); - if (!__folio_end_writeback(folio)) - BUG(); + __folio_end_writeback(folio); smp_mb__after_atomic(); folio_wake(folio, PG_writeback); diff --git a/mm/internal.h b/mm/internal.h index 2b79da9deb64..ac4bbe2e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -105,7 +105,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); -bool __folio_end_writeback(struct folio *folio); +void __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 001adbb4a180..a37e4c33d1ab 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2940,11 +2940,10 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) spin_unlock_irqrestore(&wb->work_lock, flags); } -bool __folio_end_writeback(struct folio *folio) +void __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); - bool ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { @@ -2953,19 +2952,16 @@ bool __folio_end_writeback(struct folio *folio) unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - ret = folio_test_clear_writeback(folio); - if (ret) { - __xa_clear_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); - if (!mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - wb_inode_writeback_end(wb); - } + folio_test_clear_writeback(folio); + __xa_clear_mark(&mapping->i_pages, folio_index(folio), + PAGECACHE_TAG_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } if (mapping->host && !mapping_tagged(mapping, @@ -2974,15 +2970,13 @@ bool __folio_end_writeback(struct folio *folio) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = folio_test_clear_writeback(folio); - } - if (ret) { - lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - node_stat_mod_folio(folio, NR_WRITTEN, nr); + folio_test_clear_writeback(folio); } + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); folio_memcg_unlock(folio); - return ret; } bool __folio_start_writeback(struct folio *folio, bool keep_write) -- cgit From 2580d554585c52a644839864ef9238af5b030ebc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:17 +0100 Subject: mm: use folio_xor_flags_has_waiters() in folio_end_writeback() Match how folio_unlock() works by combining the test for PG_waiters with the clearing of PG_writeback. This should have a small performance win, and removes the last user of folio_wake(). 
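Condensed, the wakeup path changes as follows (sketch; the folio reference handling and accounting are elided):

	/* Before: clear PG_writeback, then separately test PG_waiters. */
	__folio_end_writeback(folio);
	smp_mb__after_atomic();
	if (folio_test_waiters(folio))
		folio_wake_bit(folio, PG_writeback);

	/* After: one atomic xor clears PG_writeback and reports PG_waiters. */
	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);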
Link: https://lkml.kernel.org/r/20231004165317.1061855-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 15 +++------------ mm/internal.h | 2 +- mm/page-writeback.c | 9 ++++++--- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 523d9e15b3b0..b04ba896aac9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1175,13 +1175,6 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) spin_unlock_irqrestore(&q->lock, flags); } -static void folio_wake(struct folio *folio, int bit) -{ - if (!folio_test_waiters(folio)) - return; - folio_wake_bit(folio, bit); -} - /* * A choice of three behaviors for folio_wait_bit_common(): */ @@ -1618,13 +1611,11 @@ void folio_end_writeback(struct folio *folio) * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and - * reused before the folio_wake(). + * reused before the folio_wake_bit(). */ folio_get(folio); - __folio_end_writeback(folio); - - smp_mb__after_atomic(); - folio_wake(folio, PG_writeback); + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); folio_put(folio); } diff --git a/mm/internal.h b/mm/internal.h index ac4bbe2e2ca6..2b79da9deb64 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -105,7 +105,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); -void __folio_end_writeback(struct folio *folio); +bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a37e4c33d1ab..46f2f5d3d183 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2940,10 +2940,11 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) spin_unlock_irqrestore(&wb->work_lock, flags); } -void __folio_end_writeback(struct folio *folio) +bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); + bool ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { @@ -2952,7 +2953,7 @@ void __folio_end_writeback(struct folio *folio) unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - folio_test_clear_writeback(folio); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { @@ -2970,13 +2971,15 @@ void __folio_end_writeback(struct folio *folio) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - folio_test_clear_writeback(folio); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); folio_memcg_unlock(folio); + + return ret; } bool 
__folio_start_writeback(struct folio *folio, bool keep_write) -- cgit From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 4 Oct 2023 15:32:48 +0000 Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER Originally, hugetlb_cgroup was the only hugetlb user of tail page structure fields. So, the code defined and checked against HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use. However, by now, tail page #2 is used to store hugetlb hwpoison and subpool information as well. In other words, without that tail page hugetlb doesn't work. Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and checks against it. Instead, just check for the minimum viable page order at hstate creation time. Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com Signed-off-by: Frank van der Linden Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- mm/hugetlb_cgroup.c | 20 ++------------------ 2 files changed, 3 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7ad9d2159da4..e2b1c417b90a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4361,7 +4361,7 @@ void __init hugetlb_add_hstate(unsigned int order) return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); + BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; mutex_init(&h->resize_lock); h->order = order; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index dedd2edb076e..aa4486bd3904 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -262,12 +262,6 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) goto done; - /* - * We don't charge any cgroup if the compound page have less - * than 3 pages. - */ - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); @@ -397,9 +391,6 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - return; - page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); @@ -869,15 +860,8 @@ void __init hugetlb_cgroup_file_init(void) { struct hstate *h; - for_each_hstate(h) { - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].private for storing cgroup details. - */ - if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) - __hugetlb_cgroup_file_init(hstate_index(h)); - } + for_each_hstate(h) + __hugetlb_cgroup_file_init(hstate_index(h)); } /* -- cgit From 4b569387c0d566db288e7c3e1b484b43df797bdb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:26 -0700 Subject: memcontrol: add helpers for hugetlb memcg accounting Patch series "hugetlb memcg accounting", v4. Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. 
We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etcetera fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch series rectifies this issue by charging the memcg when the hugetlb folio is allocated, and uncharging when the folio is freed. In addition, a new selftest is added to demonstrate and verify this new behavior. This patch (of 4): This patch exposes charge committing and cancelling as parts of the memory controller interface. These functionalities are useful when the try_charge() and commit_charge() stages have to be separated by other actions in between (which can fail). One such example is the new hugetlb accounting behavior in the following patch. The patch also adds a helper function to obtain a reference to the current task's memcg. Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/memcontrol.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8539f2037168..b3e40a59f732 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1099,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void) return false; } +/** + * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. + */ +struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; + +again: + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!css_tryget(&memcg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + return memcg; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -2873,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +/** + * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. + * @memcg: memcg previously charged. + * @nr_pages: number of pages previously charged. + */ +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2898,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } +/** + * mem_cgroup_commit_charge - commit a previously successful try_charge(). + * @folio: folio to commit the charge to. + * @memcg: memcg previously charged. 
+ */ +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +{ + css_get(&memcg->css); + commit_charge(folio, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); + memcg_check_events(memcg, folio_nid(folio)); + local_irq_enable(); +} + #ifdef CONFIG_MEMCG_KMEM /* * The allocated objcg pointers array is not accounted directly. @@ -6116,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - cancel_charge(mc.to, mc.precharge); + mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6124,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - cancel_charge(mc.from, mc.moved_charge); + mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -7031,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - long nr_pages = folio_nr_pages(folio); int ret; - ret = try_charge(memcg, gfp, nr_pages); + ret = try_charge(memcg, gfp, folio_nr_pages(folio)); if (ret) goto out; - css_get(&memcg->css); - commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + mem_cgroup_commit_charge(folio, memcg); out: return ret; } -- cgit From 85ce2c517ade0d51b7ad95f2e88be9bbe294379a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:27 -0700 Subject: memcontrol: only transfer the memcg data for migration For most migration use cases, only transfer the memcg data from the old folio to the new folio, and clear the old folio's memcg data. No charging and uncharging will be done. This shaves off some work on the migration path, and avoids the temporary double charging of a folio during its migration. The only exception is replace_page_cache_folio(), which will use the old mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio). In that context, the isolation of the old page isn't quite as thorough as with migration, so we cannot use our new implementation directly. 
This patch is the result of the following discussion on the new hugetlb memcg accounting behavior: https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/ Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/memcontrol.c | 40 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b04ba896aac9..48cd16c54e86 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -816,7 +816,7 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b3e40a59f732..135d6637afe5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7292,16 +7292,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - Charge a folio's replacement. + * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. + * be uncharged upon free. This is only used by the page cache + * (in replace_page_cache_folio()). * * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct folio *old, struct folio *new) +void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); @@ -7340,6 +7341,39 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) local_irq_restore(flags); } +/** + * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. + * @old: Currently circulating folio. + * @new: Replacement folio. + * + * Transfer the memcg data from the old folio to the new folio for migration. + * The old folio's data info will be cleared. Note that the memory counters + * will remain unchanged throughout the process. + * + * Both folios must be locked, @new->mapping must be set up. + */ +void mem_cgroup_migrate(struct folio *old, struct folio *new) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new); + + if (mem_cgroup_disabled()) + return; + + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); + if (!memcg) + return; + + /* Transfer the charge and the css ref */ + commit_charge(new, memcg); + old->memcg_data = 0; +} + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -- cgit From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:28 -0700 Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. 
For instance, here is one of our use cases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic pages, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, and slab, fair. What we have had to resort to is constantly polling hugetlb usage and readjusting memory.max. A similar procedure is done for other memory limits (memory.low, for example). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in correcting the memory limits (for example, when hugetlb usage changes between consecutive runs of the userspace agent), the system could be left in an over- or underprotected state. This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller). Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg. Some caveats to consider:

* This feature is only available on cgroup v2.
* There is no hugetlb pool management involved in the memory controller. As stated above, hugetlb folios are only charged towards the memory controller when they are used. Host overcommit management has to take this into account when configuring hard limits.
* Failure to charge towards the memcg results in SIGBUS. This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and the reclaim attempt fails).
* When this feature is enabled, hugetlb pages contribute to memory reclaim protection. Tuning of the low and min limits must take hugetlb memory into account.
* Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on).
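Condensing the alloc_hugetlb_folio() changes below, the charging protocol is (sketch; reservation handling and error paths trimmed):

	memcg = get_mem_cgroup_from_current();
	ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
	if (ret == -ENOMEM)		/* limit hit; the fault path raises SIGBUS */
		return ERR_PTR(-ENOMEM);
	/*
	 * ret == -EOPNOTSUPP means accounting is disabled: allocate, but
	 * never commit or cancel the (nonexistent) charge.
	 */
	folio = ...;			/* obtain a folio from the hugetlb pool */
	if (!ret) {
		if (folio)
			mem_cgroup_commit_charge(folio, memcg);
		else
			mem_cgroup_cancel_charge(memcg, nr_pages);
	}
	mem_cgroup_put(memcg);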
Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/hugetlb.c | 35 ++++++++++++++++++++++++++++------- mm/memcontrol.c | 42 +++++++++++++++++++++++++++++++++++++++++- mm/migrate.c | 3 +-- 3 files changed, 70 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e2b1c417b90a..da6f85b7db88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation: out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 135d6637afe5..a86e7b445800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) return ret; } +/** + * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio + * @memcg: memcg to charge. + * @gfp: reclaim mode. + * @nr_pages: number of pages to charge. 
+ * + * This function is called when allocating a huge page folio to determine if + * the memcg has the capacity for it. It does not commit the charge yet, + * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * + * Once we have obtained the hugetlb folio, we can call + * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the + * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect + * of try_charge(). + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages) +{ + /* + * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, + * but do not attempt to commit charge later (or cancel on error) either. + */ + if (mem_cgroup_disabled() || !memcg || + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + return -EOPNOTSUPP; + + if (try_charge(memcg, gfp, nr_pages)) + return -ENOMEM; + + return 0; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. @@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); + /* + * Note that it is normal to see !memcg for a hugetlb folio. + * For e.g, itt could have been allocated when memory_hugetlb_accounting + * was not selected. + */ + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); if (!memcg) return; diff --git a/mm/migrate.c b/mm/migrate.c index 9d9eee5322d1..c602bf6dec97 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_copy_owner(newfolio, folio); - if (!folio_test_hugetlb(folio)) - mem_cgroup_migrate(folio, newfolio); + mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); -- cgit From 7a81751fcdeb833acc858e59082688e3020bfe12 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Mon, 25 Sep 2023 13:01:10 -0700 Subject: mm/thp: fix "mm: thp: kill __transhuge_page_enabled()" The 6.0 commits: commit 9fec51689ff6 ("mm: thp: kill transparent_hugepage_active()") commit 7da4e2cb8b1f ("mm: thp: kill __transhuge_page_enabled()") merged "can we have THPs in this VMA?" logic that was previously done separately by fault-path, khugepaged, and smaps "THPeligible" checks. During the process, the semantics of the fault path check changed in two ways: 1) A VM_NO_KHUGEPAGED check was introduced (also added to smaps path). 2) We no longer checked if non-anonymous memory had a vm_ops->huge_fault handler that could satisfy the fault. Previously, this check had been done in create_huge_pud() and create_huge_pmd() routines, but after the changes, we never reach those routines. During the review of the above commits, it was determined that in-tree users weren't affected by the change; most notably, since the only relevant user (in terms of THP) of VM_MIXEDMAP or ->huge_fault is DAX, which is explicitly approved early in approval logic. However, this was a bad assumption to make as it assumes the only reason to support ->huge_fault was for DAX (which is not true in general). Remove the VM_NO_KHUGEPAGED check when not in collapse path and give any ->huge_fault handler a chance to handle the fault. Note that we don't validate the file mode or mapping alignment, which is consistent with the behavior before the aforementioned commits. 
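For illustration, here is a sketch of the kind of out-of-tree user this re-enables; the driver and its names are hypothetical, and the ->huge_fault() signature shown is the one current at the time of this series:

        /*
         * Hypothetical non-DAX driver, shown only to illustrate the
         * ->huge_fault() users that hugepage_vma_check() must not reject
         * in the fault path.
         */
        static vm_fault_t demo_huge_fault(struct vm_fault *vmf, unsigned int order)
        {
                /*
                 * Try to install a PMD-sized mapping here; returning
                 * VM_FAULT_FALLBACK lets the core fall back to small pages.
                 */
                return VM_FAULT_FALLBACK;
        }

        static const struct vm_operations_struct demo_vm_ops = {
                .huge_fault     = demo_huge_fault,
        };

        static int demo_mmap(struct file *file, struct vm_area_struct *vma)
        {
                vma->vm_ops = &demo_vm_ops;     /* fault path may now consult it */
                return 0;
        }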
Link: https://lkml.kernel.org/r/20230925200110.1979606-1-zokeefe@google.com Fixes: 7da4e2cb8b1f ("mm: thp: kill __transhuge_page_enabled()") Reported-by: Saurabh Singh Sengar Signed-off-by: Zach O'Keefe Cc: Yang Shi Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 51a66eb48938..c9cbcbf6697e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,11 +100,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, return in_pf; /* - * Special VMA and hugetlb VMA. + * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set. */ - if (vm_flags & VM_NO_KHUGEPAGED) + if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return false; /* @@ -132,12 +132,18 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, !hugepage_flags_always()))) return false; - /* Only regular file is valid */ - if (!in_pf && file_thp_enabled(vma)) - return true; - - if (!vma_is_anonymous(vma)) + if (!vma_is_anonymous(vma)) { + /* + * Trust that ->huge_fault() handlers know what they are doing + * in fault path. + */ + if (((in_pf || smaps)) && vma->vm_ops->huge_fault) + return true; + /* Only regular file is valid in collapse path */ + if (((!in_pf || smaps)) && file_thp_enabled(vma)) + return true; return false; + } if (vma_is_temporary_stack(vma)) return false; -- cgit From f04eba134e598d4a5af2eeecff0562df5042cbfc Mon Sep 17 00:00:00 2001 From: Lucy Mielke Date: Fri, 6 Oct 2023 22:30:51 +0200 Subject: mm: add printf attribute to shrinker_debugfs_name_alloc This fixes a compiler warning when compiling an allyesconfig with W=1: mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf' format attribute [-Werror=suggest-attribute=format] [akpm@linux-foundation.org: fix shrinker_alloc() as well per Qi Zheng] Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker") Signed-off-by: Lucy Mielke Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/internal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 2b79da9deb64..d1f7a3c612fe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1219,8 +1219,8 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG -static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, - const char *fmt, va_list ap) +static inline __printf(2, 0) int shrinker_debugfs_name_alloc( + struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); -- cgit From 9b91432985851e5d55a41478bc0ecf93bf746981 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 10 Oct 2023 08:25:02 +0100 Subject: mm/mprotect: allow unfaulted VMAs to be unaccounted on mprotect() When mprotect() is used to make unwritable VMAs writable, they have the VM_ACCOUNT flag applied and memory accounted accordingly. If the VMA has had no pages faulted in and is then made unwritable once again, it will remain accounted for, despite not being capable of extending memory usage.
Consider:- ptr = mmap(NULL, page_size * 3, PROT_READ, MAP_ANON | MAP_PRIVATE, -1, 0); mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE); mprotect(ptr + page_size, page_size, PROT_READ); The first mprotect() splits the range into 3 VMAs and the second fails to merge the three as the middle VMA has VM_ACCOUNT set and the others do not, rendering them unmergeable. This is unnecessary, since no pages have actually been allocated and the middle VMA is not capable of utilising more memory, thereby introducing unnecessary VMA fragmentation (and accounting for more memory than is necessary). Since we cannot efficiently determine which pages map to an anonymous VMA, we have to be very conservative - determining whether any pages at all have been faulted in, by checking whether vma->anon_vma is NULL. We can see that the lack of anon_vma implies that no anonymous pages are present as evidenced by vma_needs_copy() utilising this on fork to determine whether page tables need to be copied. The only place where anon_vma is set NULL explicitly is on fork with VM_WIPEONFORK set, however since this flag is intended to cause the child process to not CoW on a given memory range, it is right to interpret this as indicating the VMA has no faulted-in anonymous memory mapped. If the VMA was forked without VM_WIPEONFORK set, then anon_vma_fork() will have ensured that a new anon_vma is assigned (and correctly related to its parent anon_vma) should any pages be CoW-mapped. The overall operation is safe against races as we hold a write lock against mm->mmap_lock. If we could efficiently look up the VMA's faulted-in pages then we would unaccount all those pages not yet faulted in. However as the original comment alludes this simply isn't currently possible, so we are conservative and account all pages or none at all. Link: https://lkml.kernel.org/r/ad5540371a16623a069f03f4db1739f33cde1fab.1696921767.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mprotect.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index b94fbb45d5c7..03e2cec3e669 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -608,8 +608,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. hugetlb mapping were accounted for - * even if read-only so there is no need to account for them here + * make it unwritable again except in the anonymous case where no + * anon_vma has yet to be assigned. + * + * hugetlb mapping were accounted for even if read-only so there is + * no need to account for them here. */ if (newflags & VM_WRITE) { /* Check space limits when area turns into data. */ @@ -623,6 +626,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, return -ENOMEM; newflags |= VM_ACCOUNT; } + } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && + !vma->anon_vma) { + newflags &= ~VM_ACCOUNT; } /* @@ -665,6 +671,9 @@ success: change_protection(tlb, vma, start, end, mm_cp_flags); + if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT)) + vm_unacct_memory(nrpages); + /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. 
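A userspace sketch of the observable effect (a hypothetical test, not from the patch): counting lines of /proc/self/maps shows the VMA count dropping back once the never-faulted middle VMA loses VM_ACCOUNT and the three VMAs can merge again.

        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        /* Crude VMA count: number of lines in /proc/self/maps. */
        static int count_vmas(void)
        {
                char line[512];
                int n = 0;
                FILE *f = fopen("/proc/self/maps", "r");

                while (f && fgets(line, sizeof(line), f))
                        n++;
                if (f)
                        fclose(f);
                return n;
        }

        int main(void)
        {
                long sz = sysconf(_SC_PAGESIZE);
                char *ptr = mmap(NULL, sz * 3, PROT_READ,
                                 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

                if (ptr == MAP_FAILED)
                        return 1;
                mprotect(ptr + sz, sz, PROT_READ | PROT_WRITE);
                printf("after split: %d VMAs\n", count_vmas());
                mprotect(ptr + sz, sz, PROT_READ);
                /* With this patch the count drops back; without it, it stays. */
                printf("after revert: %d VMAs\n", count_vmas());
                return 0;
        }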
-- cgit From b0b598ee08f9759b70971e297cf7ddd3eaaa5245 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 04:58:28 +0100 Subject: filemap: remove use of wait bookmarks The original problem of the overly long list of waiters on a locked page was solved properly by commit 9a1ea439b16b ("mm: put_and_wait_on_page_locked() while page is migrated"). In the meantime, using bookmarks for the writeback bit can cause livelocks, so we need to stop using them. Link: https://lkml.kernel.org/r/20231010035829.544242-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Bin Lai Cc: Benjamin Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Ingo Molnar Signed-off-by: Andrew Morton --- mm/filemap.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 48cd16c54e86..9ef49255f1a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1133,32 +1133,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; - wait_queue_entry_t bookmark; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); - spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); - - while (bookmark.flags & WQ_FLAG_BOOKMARK) { - /* - * Take a breather from holding the lock, - * allow pages that finish wake up asynchronously - * to acquire the lock and remove themselves - * from wait queue - */ - spin_unlock_irqrestore(&q->lock, flags); - cpu_relax(); - spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); - } + __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page -- cgit From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:28 +0100 Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al. mprotect() and other functions which change VMA parameters over a range each employ a pattern of:- 1. Attempt to merge the range with adjacent VMAs. 2. If this fails, and the range spans a subset of the VMA, split it accordingly. This is open-coded and duplicated in each case. Also in each case most of the parameters passed to vma_merge() remain the same. Create a new function, vma_modify(), which abstracts this operation, accepting only those parameters which can be changed. To avoid the mess of invoking each function call with unnecessary parameters, create inline wrapper functions for each of the modify operations, parameterised only by what is required to perform the action. We can also significantly simplify the logic - by returning the VMA if we split (or merged VMA if we do not) we no longer need specific handling for merge/split cases in any of the call sites. Note that the userfaultfd_release() case works even though it does not split VMAs - since start is set to vma->vm_start and end is set to vma->vm_end, the split logic does not trigger. 
In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start - vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this instance, this invocation will remain unchanged. We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that vma_merge() correctly ensures that flags remain the same, something that is already checked in is_mergeable_vma() and elsewhere, and in any case is not specific to mprotect(). Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/madvise.c | 26 ++++---------------------- mm/mempolicy.c | 26 +++----------------------- mm/mlock.c | 25 ++++--------------------- mm/mmap.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 29 ++++------------------------- 5 files changed, 63 insertions(+), 91 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 59e8860c86af..a6b48d4796ba 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; int error; - pgoff_t pgoff; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { @@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_name); - if (*prev) { - vma = *prev; - goto success; - } + vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, + anon_name); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; - if (start != vma->vm_start) { - error = split_vma(&vmi, vma, start, 1); - if (error) - return error; - } - - if (end != vma->vm_end) { - error = split_vma(&vmi, vma, end, 0); - if (error) - return error; - } - -success: /* vm_flags is protected by the mmap_lock held in write mode. 
*/ vma_start_write(vma); vm_flags_reset(vma, new_flags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 38a47fa33ef4..8bb5b4b4b395 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -811,10 +811,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - struct vm_area_struct *merged; unsigned long vmstart, vmend; - pgoff_t pgoff; - int err; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { @@ -829,26 +826,9 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol, - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (merged) { - *prev = merged; - return vma_replace_policy(merged, new_pol); - } - - if (vma->vm_start != vmstart) { - err = split_vma(vmi, vma, vmstart, 1); - if (err) - return err; - } - - if (vma->vm_end != vmend) { - err = split_vma(vmi, vma, vmend, 0); - if (err) - return err; - } + vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); diff --git a/mm/mlock.c b/mm/mlock.c index 42b6865f8f82..aa44456200e3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -476,7 +476,6 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - pgoff_t pgoff; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; @@ -487,28 +486,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(vmi, mm, *prev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*prev) { - vma = *prev; - goto success; - } - - if (start != vma->vm_start) { - ret = split_vma(vmi, vma, start, 1); - if (ret) - goto out; - } - - if (end != vma->vm_end) { - ret = split_vma(vmi, vma, end, 0); - if (ret) - goto out; + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; } -success: /* * Keep track of amount of locked VM. */ diff --git a/mm/mmap.c b/mm/mmap.c index 673429ee8a9e..bca685820763 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2437,6 +2437,54 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. 
+ */ +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name) +{ + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *merged; + + merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, + vma->anon_vma, vma->vm_file, pgoff, policy, + uffd_ctx, anon_name); + if (merged) + return merged; + + if (vma->vm_start < start) { + int err = split_vma(vmi, vma, start, 1); + + if (err) + return ERR_PTR(err); + } + + if (vma->vm_end > end) { + int err = split_vma(vmi, vma, end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mprotect.c b/mm/mprotect.c index 03e2cec3e669..f1dc8f8c84ef 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -581,7 +581,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; - pgoff_t pgoff; int error; if (newflags == oldflags) { @@ -631,34 +630,14 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, newflags &= ~VM_ACCOUNT; } - /* - * First try to merge with previous and/or next vma. - */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*pprev) { - vma = *pprev; - VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); - goto success; + vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + if (IS_ERR(vma)) { + error = PTR_ERR(vma); + goto fail; } *pprev = vma; - if (start != vma->vm_start) { - error = split_vma(vmi, vma, start, 1); - if (error) - goto fail; - } - - if (end != vma->vm_end) { - error = split_vma(vmi, vma, end, 0); - if (error) - goto fail; - } - -success: /* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. -- cgit From adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:29 +0100 Subject: mm: make vma_merge() and split_vma() internal Now the common pattern of - attempting a merge via vma_merge() and should this fail splitting VMAs via split_vma() - has been abstracted, the former can be placed into mm/internal.h and the latter made static. In addition, the split_vma() nommu variant also need not be exported. Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mmap.c | 8 ++++---- mm/nommu.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index d1f7a3c612fe..2ab81c68be6c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1002,6 +1002,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +/* + * mm/mmap.c + */ +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, diff --git a/mm/mmap.c b/mm/mmap.c index bca685820763..a516f2412f79 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2346,8 +2346,8 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the end VMA. */ -int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -2428,8 +2428,8 @@ out_free_vma: * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index f9553579389b..fc4afe924ad5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1305,8 +1305,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; -- cgit From 4b5f2d20169827add8875e78a352cf956ccdd55a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:30 +0100 Subject: mm: abstract merge for new VMAs into vma_merge_new_vma() Only in mmap_region() and copy_vma() do we attempt to merge VMAs which occupy entirely new regions of virtual memory. We can abstract this logic and make the intent of this invocations of it completely explicit, rather than invoking vma_merge() with an inscrutable wall of parameters. This also paves the way for a simplification of the core vma_merge() implementation, as we seek to make it entirely an implementation detail. The VMA merge call in mmap_region() occurs only for file-backed mappings, where each of the parameters previously specified as NULL are defaulted to NULL in vma_init() (called by vm_area_alloc()). This matches the previous behaviour of specifying NULL for a number of fields, however note that prior to this call we pass the VMA to the file system driver via call_mmap(), which may in theory adjust fields that we pass in to vma_merge_new_vma(). 
Therefore we actually resolve an oversight here by allowing for the fact that the driver may have done this. Link: https://lkml.kernel.org/r/3dc71d17e307756a54781d4a4ce7315cf8b18bea.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index a516f2412f79..e5e50e547ebf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2485,6 +2485,20 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, return vma; } +/* + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller + * must ensure that [start, end) does not overlap any existing VMA. + */ +static struct vm_area_struct +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff) +{ + return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2840,10 +2854,9 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, - vma->vm_file, vma->vm_pgoff, NULL, - NULL_VM_UFFD_CTX, NULL); + merge = vma_merge_new_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end, + vma->vm_pgoff); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3385,9 +3398,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); if (new_vma) { /* * Source vma may have been merged into new_vma -- cgit From 93bf5d4aa27d4ba528f71483ae51fbd70edb3ce8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:31 +0100 Subject: mm: abstract VMA merge and extend into vma_merge_extend() helper mremap uses vma_merge() in the case where a VMA needs to be extended. This can be significantly simplified and abstracted. This makes it far easier to understand what the actual function is doing, avoids future mistakes in use of the confusing vma_merge() function and importantly allows us to make future changes to how vma_merge() is implemented by knowing explicitly which merge cases each invocation uses. Note that in the mremap() extend case, we perform this merge only when old_len == vma->vm_end - addr. The extension_start, i.e. the start of the extended portion of the VMA is equal to addr + old_len, i.e. vma->vm_end. With this refactoring, vma_merge() is no longer required anywhere except mm/mmap.c, so mark it static. Link: https://lkml.kernel.org/r/f16cbdc2e72d37a1a097c39dc7d1fee8919a1c93.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 8 +++----- mm/mmap.c | 31 ++++++++++++++++++++++++------- mm/mremap.c | 30 +++++++++++++----------------- 3 files changed, 40 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 2ab81c68be6c..c61a98d3b3c7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1005,11 +1005,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* * mm/mmap.c */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, - struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, struct anon_vma *, - struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); enum { /* mark page accessed */ diff --git a/mm/mmap.c b/mm/mmap.c index e5e50e547ebf..3ea52451623b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -860,13 +860,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct +*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; @@ -2499,6 +2499,23 @@ static struct vm_area_struct vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } +/* + * Expand vma by delta bytes, potentially merging with an immediately adjacent + * VMA with identical properties. + */ +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta) +{ + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); + + /* vma is specified as prev, so case 1 or 2 will apply. */ + return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mremap.c b/mm/mremap.c index ce8a23ef325a..38d98465f3d8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1096,14 +1096,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* old_len exactly to the end of the area.. */ if (old_len == vma->vm_end - addr) { + unsigned long delta = new_len - old_len; + /* can we just expand the current mapping? 
*/ - if (vma_expandable(vma, new_len - old_len)) { - long pages = (new_len - old_len) >> PAGE_SHIFT; - unsigned long extension_start = addr + old_len; - unsigned long extension_end = addr + new_len; - pgoff_t extension_pgoff = vma->vm_pgoff + - ((extension_start - vma->vm_start) >> PAGE_SHIFT); - VMA_ITERATOR(vmi, mm, extension_start); + if (vma_expandable(vma, delta)) { + long pages = delta >> PAGE_SHIFT; + VMA_ITERATOR(vmi, mm, vma->vm_end); long charged = 0; if (vma->vm_flags & VM_ACCOUNT) { @@ -1115,17 +1113,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we - * are adding to the already existing vma, vma_merge() - * will merge this extension with the already existing - * vma (expand operation itself) and possibly also with - * the next vma if it becomes adjacent to the expanded - * vma and otherwise compatible. + * Function vma_merge_extend() is called on the + * extension we are adding to the already existing vma, + * vma_merge_extend() will merge this extension with the + * already existing vma (expand operation itself) and + * possibly also with the next vma if it becomes + * adjacent to the expanded vma and otherwise + * compatible. */ - vma = vma_merge(&vmi, mm, vma, extension_start, - extension_end, vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vma_merge_extend(&vmi, vma, delta); if (!vma) { vm_unacct_memory(charged); ret = -ENOMEM; -- cgit From b459f0905eec31196b6e841773aabe3194e3c3fe Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:45 -0400 Subject: mm/page_owner: remove free_ts from page_owner output Patch series "Fix page_owner's use of free timestamps". While page_owner output is used to investigate memory utilization, typically the allocation pathway, the introduction of timestamps to the page owner records caused each record to become unique due to the granularity of the nanosecond timestamp (for example): Page allocated via order 0 ... ts 5206196026 ns, free_ts 5187156703 ns Page allocated via order 0 ... ts 5206198540 ns, free_ts 5187162702 ns Furthermore, the page_owner output only dumps the currently allocated records, so having the free timestamps is nonsensical for the typical use case. In addition, the introduction of timestamps was not properly handled in the page_owner_sort tool, causing most use cases to be broken. This series is meant to remove the free timestamps from the page_owner output and fix the page_owner_sort tool so proper collation can occur. This patch (of 5): When printing page_owner data via the sysfs interface, no free pages will ever be dumped due to the series of checks in read_page_owner(): /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) The free_ts values are still used when dump_page_owner() is called, so the field is kept for those use cases but removed from the typical page_owner output.
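For reference, a sketch of consuming the records after this change (it assumes debugfs is mounted and page_owner=on on the kernel command line; each read() is assumed to return one record):

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                char buf[4096];
                ssize_t n;
                int fd = open("/sys/kernel/debug/page_owner", O_RDONLY);

                if (fd < 0)
                        return 1;
                /*
                 * With this patch the record's first line ends at the
                 * allocation timestamp: "Page allocated via order %u,
                 * ... ts %llu ns", with no free_ts field.
                 */
                n = read(fd, buf, sizeof(buf) - 1);
                if (n > 0) {
                        buf[n] = '\0';
                        fputs(buf, stdout);
                }
                close(fd);
                return 0;
        }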
Link: https://lkml.kernel.org/r/20231013190350.579407-1-audra@redhat.com Link: https://lkml.kernel.org/r/20231013190350.579407-2-audra@redhat.com Fixes: 866b48526217 ("mm/page_owner: record the timestamp of all pages during free") Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Reviewed-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- mm/page_owner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 4e2723e1b300..4f13ce7d2452 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -408,11 +408,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = scnprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, - page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); -- cgit From 4d4e41b682990b1dc5bba2bc313800340bf5c2d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:53 +0000 Subject: mm/damon/sysfs-schemes: do not update tried regions more than one DAMON snapshot Patch series "mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval". The DAMOS tried regions update feature of the DAMON sysfs interface does the update for one aggregation interval after the request is made. Now that per-scheme apply intervals are supported, that behavior makes little sense. That is, the tried regions directory will have regions from multiple DAMON monitoring results snapshots, or no regions at all, for apply intervals much shorter than, or much longer than, the aggregation interval, respectively. Update the behavior to update the regions for each scheme for only its apply interval, and update the document. Since the DAMOS apply interval defaults to the aggregation interval, this change makes no visible behavioral difference to old users who don't explicitly set the apply intervals. Patches Sequence ---------------- The first two patches make schemes whose apply intervals are much shorter or longer than the aggregation interval keep the maximum and minimum times for continuing the update. After the two patches, the update aligns with each scheme's apply interval. Finally, the third patch updates the document to reflect the behavior. This patch (of 3): DAMON_SYSFS exposes every DAMON-found region that is eligible for applying the scheme action for one aggregation interval. However, each DAMON-based operation scheme has its own apply interval. Hence, for a scheme having an apply interval much smaller than the aggregation interval, DAMON_SYSFS will expose scheme regions that were applied across more than one DAMON monitoring results snapshot. Since the purpose of DAMON tried regions is to expose a single snapshot, this makes little sense. Track the progress of each scheme's tried regions update and avoid the case.
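For context, the update is requested from userspace roughly as in the following sketch (it assumes kdamond 0 is already configured):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/sys/kernel/mm/damon/admin/kdamonds/0/state", "w");

                if (!f)
                        return 1;
                /*
                 * Ask the kdamond to (re)populate each scheme's tried_regions
                 * directory; this patch bounds that to one snapshot per scheme.
                 */
                fputs("update_schemes_tried_regions", f);
                fclose(f);
                return 0;
        }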
Link: https://lkml.kernel.org/r/20231012192256.33556-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231012192256.33556-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a7d70b95c4dd..b07a5c544b34 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -113,11 +113,47 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ +/* + * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update + * status + * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. + * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. + * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. + * + * Each DAMON-based operation scheme (&struct damos) has its own apply + * interval, and we need to expose the scheme tried regions based on only + * single snapshot. For this, we keep the tried regions update status for each + * scheme. The status becomes 'idle' at the beginning. + * + * Once the tried regions update request is received, the request handling + * start function (damon_sysfs_scheme_update_regions_start()) sets the status + * of all schemes as 'idle' again, and register ->before_damos_apply() and + * ->after_sampling() callbacks. + * + * Then, the first followup ->before_damos_apply() callback + * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first + * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call + * is called only after the scheme is completely applied + * to the given snapshot. Hence the callback knows the situation by showing + * 'started' status, and sets the status as 'finished'. Then, + * damon_sysfs_before_damos_apply() understands the situation by showing the + * 'finished' status and do nothing. + * + * Finally, the tried regions request handling finisher function + * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. + */ +enum damos_sysfs_regions_upd_status { + DAMOS_TRIED_REGIONS_UPD_IDLE, + DAMOS_TRIED_REGIONS_UPD_STARTED, + DAMOS_TRIED_REGIONS_UPD_FINISHED, +}; + struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; + enum damos_sysfs_regions_upd_status upd_status; }; static struct damon_sysfs_scheme_regions * @@ -130,6 +166,7 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; + regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -1777,6 +1814,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) + return 0; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) + sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; if (damos_regions_upd_total_bytes_only) return 0; @@ -1793,6 +1834,29 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; } +/* + * DAMON callback that called after each accesses sampling. While this + * callback is registered, damon_sysfs_lock should be held to ensure the + * regions directories exist. 
+ */ +static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status == + DAMOS_TRIED_REGIONS_UPD_STARTED) + sysfs_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_FINISHED; + } + + return 0; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, @@ -1816,6 +1880,16 @@ int damon_sysfs_schemes_clear_regions( return 0; } +static void damos_tried_regions_init_upd_status( + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) + sysfs_schemes->schemes_arr[i]->tried_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_IDLE; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, @@ -1823,8 +1897,10 @@ int damon_sysfs_schemes_update_regions_start( { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_tried_regions_init_upd_status(sysfs_schemes); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -1837,6 +1913,7 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; + ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } -- cgit From 76126332c7606ba25a4ae5db37145fd526985b45 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:54 +0000 Subject: mm/damon/sysfs: avoid empty scheme tried regions for large apply interval DAMON_SYSFS assumes all schemes will be applied to at least one DAMON monitoring results snapshot within one aggregation interval, or that it makes no sense to wait for a snapshot while DAMON is deactivated by the watermarks. The assumption for the deactivated case still makes sense, but the aggregation interval based assumption is invalid now because each scheme can have its own apply interval. For schemes having apply intervals larger than the aggregation or watermarks check interval, a DAMOS tried regions update request can be finished without the update actually happening. Avoid the case by explicitly checking the status of each scheme's tried regions update and of watermarks-based DAMON deactivation.
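A companion sketch of the consumer side (the kdamond/context/scheme indices are assumptions): once an update command completes, reading total_bytes reflects one complete snapshot even for a scheme whose apply interval exceeds the aggregation interval.

        #include <stdio.h>

        #define TOTAL_BYTES \
                "/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/tried_regions/total_bytes"

        int main(void)
        {
                unsigned long long bytes;
                FILE *f = fopen(TOTAL_BYTES, "r");

                if (!f)
                        return 1;
                if (fscanf(f, "%llu", &bytes) == 1)
                        printf("scheme 0 tried regions: %llu bytes\n", bytes);
                fclose(f);
                return 0;
        }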
Link: https://lkml.kernel.org/r/20231012192256.33556-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 16 ++++++++++++++++ mm/damon/sysfs.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index fd482a0639b4..5ff081226e28 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +bool damos_sysfs_regions_upd_done(void); + int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); int damon_sysfs_schemes_clear_regions( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b07a5c544b34..45bd0fd4a8b1 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1904,6 +1904,22 @@ int damon_sysfs_schemes_update_regions_start( return 0; } +bool damos_sysfs_regions_upd_done(void) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status != + DAMOS_TRIED_REGIONS_UPD_FINISHED) + return false; + } + return true; +} + /* * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller * should unlock damon_sysfs_lock which held before diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f60e56150feb..f73dc88d2d19 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1336,12 +1336,13 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. - * @c: The DAMON context of the callback. + * @c: The DAMON context of the callback. + * @active: Whether @c is not deactivated due to watermarks. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1373,6 +1374,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) goto keep_lock_out; } } else { + /* + * Continue regions updating if DAMON is till + * active and the update for all schemes is not + * finished. + */ + if (active && !damos_sysfs_regions_upd_done()) + goto keep_lock_out; err = damon_sysfs_upd_schemes_regions_stop(kdamond); damon_sysfs_schemes_regions_updating = false; } @@ -1392,6 +1400,24 @@ keep_lock_out: return err; } +static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) +{ + /* + * after_wmarks_check() is called back while the context is deactivated + * by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, false); +} + +static int damon_sysfs_after_aggregation(struct damon_ctx *c) +{ + /* + * after_aggregation() is called back only while the context is not + * deactivated by watermarks. 
+ */ + return damon_sysfs_cmd_request_callback(c, true); +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -1407,8 +1433,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; - ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } -- cgit From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:28 +0100 Subject: mm: drop the assumption that VM_SHARED always implies writable Patch series "permit write-sealed memfd read-only shared mappings", v4. The man page for fcntl() describing memfd file seals states the following about F_SEAL_WRITE:- Furthermore, trying to create new shared, writable memory-mappings via mmap(2) will also fail with EPERM. With emphasis on 'writable'. It turns out, in fact, that currently the kernel simply disallows all new shared memory mappings for a memfd with F_SEAL_WRITE applied, rendering this documentation inaccurate. This matters because users are therefore unable to obtain a shared mapping to a memfd after write sealing altogether, which limits their usefulness. This was reported in the discussion thread [1] originating from a bug report [2]. This is a product of both using the struct address_space->i_mmap_writable atomic counter to determine whether writing may be permitted, and the kernel adjusting this counter when any VM_SHARED mapping is performed and more generally implicitly assuming VM_SHARED implies writable. It seems sensible that we should only update this counter if VM_MAYWRITE is specified, i.e. if it is possible that this mapping could at any point be written to. If we do so then all we need to do to permit write seals to function as documented is to clear VM_MAYWRITE when mapping read-only. It turns out this functionality already exists for F_SEAL_FUTURE_WRITE - we can therefore simply adapt this logic to do the same for F_SEAL_WRITE. We then hit a chicken and egg situation in mmap_region() where the check for VM_MAYWRITE occurs before we are able to clear this flag. To work around this, perform this check after we invoke call_mmap(), with careful consideration of error paths. Thanks to Andy Lutomirski for the suggestion! [1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/ [2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238 This patch (of 3): There is a general assumption that VMAs with the VM_SHARED flag set are writable. If the VM_MAYWRITE flag is not set, then this is simply not the case. Update those checks which affect the struct address_space->i_mmap_writable field to explicitly test for this by introducing [vma_]is_shared_maywrite() helper functions. This remains entirely conservative, as the lack of VM_MAYWRITE guarantees that the VMA cannot be written to.
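The helpers themselves are added in include/linux/mm.h, which falls outside the 'mm' directory filter of this log; the following is a reconstruction consistent with the description above, not a quote of the patch:

        static inline bool is_shared_maywrite(vm_flags_t vm_flags)
        {
                return (vm_flags & (VM_SHARED | VM_MAYWRITE)) ==
                        (VM_SHARED | VM_MAYWRITE);
        }

        static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
        {
                return is_shared_maywrite(vma->vm_flags);
        }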
Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Andy Lutomirski Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/madvise.c | 2 +- mm/mmap.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 9ef49255f1a5..9710f43a89ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } diff --git a/mm/madvise.c b/mm/madvise.c index a6b48d4796ba..cf4d694280e9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } - if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) diff --git a/mm/mmap.c b/mm/mmap.c index 3ea52451623b..0041e3631f6c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -2846,7 +2846,7 @@ cannot_expand: vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_SHARED) { + if (is_shared_maywrite(vm_flags)) { error = mapping_map_writable(file->f_mapping); if (error) goto free_vma; @@ -2920,7 +2920,7 @@ cannot_expand: mm->map_count++; if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(vma->vm_file->f_mapping); flush_dcache_mmap_lock(vma->vm_file->f_mapping); @@ -2937,7 +2937,7 @@ cannot_expand: /* Once vma denies write, undo our temporary denial count */ unmap_writable: - if (file && vm_flags & VM_SHARED) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); @@ -2985,7 +2985,7 @@ unmap_and_free_vma: unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } - if (file && (vm_flags & VM_SHARED)) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); -- cgit From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:29 +0100 Subject: mm: update memfd seal write check to include F_SEAL_WRITE The seal_check_future_write() function is called by shmem_mmap() or hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd sealed this way. 
The F_SEAL_WRITE flag is not checked here, as that is handled via the mapping->i_mmap_writable mechanism and so any attempt at a mapping would fail before this could be run. However we intend to change this, meaning this check can be performed for F_SEAL_WRITE mappings also. The logic here is equally applicable to both flags, so update this function to accommodate both and rename it accordingly. Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Andy Lutomirski Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 61b170324e5c..bcbe9dbb9f5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2378,7 +2378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) struct shmem_inode_info *info = SHMEM_I(inode); int ret; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; -- cgit From 158978945f3173b8c1a88f8c5684a629736a57ac Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:30 +0100 Subject: mm: perform the mapping_map_writable() check after call_mmap() In order for a F_SEAL_WRITE sealed memfd mapping to have an opportunity to clear VM_MAYWRITE, we must be able to invoke the appropriate vm_ops->mmap() handler to do so. We would otherwise fail the mapping_map_writable() check before we had the opportunity to avoid it. This patch moves this check after the call_mmap() invocation. Only memfd actively denies write access causing a potential failure here (in memfd_add_seals()), so there should be no impact on non-memfd cases. This patch makes the userland-visible change that MAP_SHARED, PROT_READ mappings of an F_SEAL_WRITE sealed memfd mapping will now succeed. There is a delicate situation with cleanup paths assuming that a writable mapping must have occurred in circumstances where it may now not have. In order to ensure we do not accidentally mark a writable file unwritable by mistake, we explicitly track whether we have a writable mapping and unmap only if we do. 
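The userland-visible change can be exercised with a short test program (a sketch, not part of the series):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                void *p;
                int fd = memfd_create("demo", MFD_ALLOW_SEALING);

                if (fd < 0 || ftruncate(fd, 4096) < 0)
                        return 1;
                if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0)
                        return 1;
                /* Failed with EPERM before this series; succeeds after it. */
                p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
                printf("read-only MAP_SHARED of F_SEAL_WRITE memfd: %s\n",
                       p == MAP_FAILED ? "failed" : "ok");
                return 0;
        }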
[lstoakes@gmail.com: do not set writable_file_mapping in inappropriate case]
Link: https://lkml.kernel.org/r/c9eb4cc6-7db4-4c2b-838d-43a0b319a4f0@lucifer.local
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217238
Link: https://lkml.kernel.org/r/55e413d20678a1bb4c7cce889062bbb07b0df892.1697116581.git.lstoakes@gmail.com
Signed-off-by: Lorenzo Stoakes
Reviewed-by: Jan Kara
Cc: Alexander Viro
Cc: Andy Lutomirski
Cc: Christian Brauner
Cc: Hugh Dickins
Cc: Matthew Wilcox (Oracle)
Cc: Mike Kravetz
Cc: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 0041e3631f6c..8b57e42fd980 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2752,6 +2752,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long charged = 0;
 	unsigned long end = addr + len;
 	unsigned long merge_start = addr, merge_end = end;
+	bool writable_file_mapping = false;
 	pgoff_t vm_pgoff;
 	int error;
 	VMA_ITERATOR(vmi, mm, addr);
@@ -2846,17 +2847,19 @@ cannot_expand:
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		if (is_shared_maywrite(vm_flags)) {
-			error = mapping_map_writable(file->f_mapping);
-			if (error)
-				goto free_vma;
-		}
-
 		vma->vm_file = get_file(file);
 		error = call_mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
 
+		if (vma_is_shared_maywrite(vma)) {
+			error = mapping_map_writable(file->f_mapping);
+			if (error)
+				goto close_and_free_vma;
+
+			writable_file_mapping = true;
+		}
+
 		/*
 		 * Expansion is handled above, merging is handled below.
 		 * Drivers should not alter the address of the VMA.
@@ -2937,7 +2940,7 @@ cannot_expand:
 	/* Once vma denies write, undo our temporary denial count */
 unmap_writable:
-	if (file && is_shared_maywrite(vm_flags))
+	if (writable_file_mapping)
 		mapping_unmap_writable(file->f_mapping);
 	file = vma->vm_file;
 	ksm_add_vma(vma);
@@ -2985,7 +2988,7 @@ unmap_and_free_vma:
 		unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
 			     vma->vm_end, vma->vm_end, true);
 	}
-	if (file && is_shared_maywrite(vm_flags))
+	if (writable_file_mapping)
 		mapping_unmap_writable(file->f_mapping);
 free_vma:
 	vm_area_free(vma);
--
cgit

From fa8c4f9a665bd0cf5a475327aea4fd179e896216 Mon Sep 17 00:00:00 2001
From: Huang Ying
Date: Fri, 11 Aug 2023 17:08:19 +0800
Subject: mm: fix draining remote pageset

If there is no memory allocation/freeing in the PCP (Per-CPU Pageset) of
a remote zone (a zone in a remote NUMA node) after some time (3 seconds
for now), the pages of the PCP of the remote zone will be drained to
avoid memory wastage.

This behavior was introduced in commit 4ae7c03943fc ("[PATCH]
Periodically drain non local pagesets") and commit 4037d452202e ("Move
remote node draining out of slab allocators").

But after commit 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8"),
the vmstat updater worker, which is used to drain the PCP of remote
zones, may not be re-queued when we are waiting for the timeout
(pcp->expire != 0) if there are no vmstat changes on this CPU, for
example, when the CPU goes idle or runs userspace-only workloads. This
may cause the pages of a remote zone to be kept in the PCP of this CPU
for a long time, so that page reclaim in the remote zone may be
triggered prematurely.

This isn't a severe problem in practice, because the PCP of the remote
zone will be drained if memory is allocated/freed again on this CPU, and
the PCP will eventually be drained during direct reclaim if necessary.
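To make the re-queueing mechanism above concrete, here is a simplified
sketch of how the vmstat worker decides whether to re-arm itself
(abridged from mm/vmstat.c; illustrative, not the verbatim kernel code).
The fix below works by making an in-progress pcp->expire countdown count
as a pending change:

static void vmstat_update(struct work_struct *w)
{
	/*
	 * refresh_cpu_vm_stats(true) returns the number of changes it
	 * observed; only then is the worker re-queued. If a decrementing
	 * pcp->expire is not counted as a change, an otherwise idle CPU
	 * stops re-arming the worker and remote PCP pages linger.
	 */
	if (refresh_cpu_vm_stats(true))
		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
			this_cpu_ptr(&vmstat_work),
			round_jiffies_relative(sysctl_stat_interval));
}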
Anyway, the problem still deserves a fix, by guaranteeing that the
vmstat updater worker is always re-queued while we are waiting for the
timeout. In effect, this restores the original behavior before commit
7cc36bbddde5.

We can reproduce the bug by allocating/freeing pages from a remote zone
and then going idle, as follows; the patch fixes it.

- Run some workloads, using `numactl` to bind the CPU to node 0 and
  memory to node 1, so the PCP of the CPU on node 0 for the zone on
  node 1 will be filled.
- After the workloads finish, idle for 60s.
- Check /proc/zoneinfo.

With the original kernel, the number of pages in the PCP of the CPU on
node 0 for the zone on node 1 is non-zero after idling. With the patched
kernel, it becomes 0 after idling. That is, we avoid keeping pages in
the remote PCP during idle.

Link: https://lkml.kernel.org/r/20231007062356.187621-1-ying.huang@intel.com
Link: https://lkml.kernel.org/r/20230811090819.60845-1-ying.huang@intel.com
Fixes: 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8")
Signed-off-by: "Huang, Ying"
Reviewed-by: Christoph Lameter
Cc: Mel Gorman
Cc: Vlastimil Babka
Cc: Michal Hocko
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 894e4c88d241..88ea95d4221c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -857,8 +857,10 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 				continue;
 			}
 
-			if (__this_cpu_dec_return(pcp->expire))
+			if (__this_cpu_dec_return(pcp->expire)) {
+				changes++;
 				continue;
+			}
 
 			if (__this_cpu_read(pcp->count)) {
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
--
cgit

From d2cf88c27f51361dfe2982e510a444411d2fc78a Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Wed, 18 Oct 2023 19:31:03 -0700
Subject: hugetlb: optimize update_and_free_pages_bulk to avoid lock cycles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Batch hugetlb vmemmap modification operations", v8.

When hugetlb vmemmap optimization was introduced, the overhead of
enabling the option was measured as described in commit 426e5c429d16 [1].
The summary states that allocating a hugetlb page should be ~2x slower
with optimization and freeing a hugetlb page should be ~2-3x slower.
Such overhead was deemed an acceptable trade-off for the memory savings
obtained by freeing vmemmap pages.

It was recently reported that the overhead associated with enabling
vmemmap optimization could be as high as 190x for hugetlb page
allocations. Yes, 190x!
Some actual numbers from other environments are:

Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895
------------------------------------------------
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0
time echo 500000 > .../hugepages-2048kB/nr_hugepages
real    0m4.119s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m4.477s

Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1
time echo 500000 > .../hugepages-2048kB/nr_hugepages
real    0m28.973s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m36.748s

VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan
-----------------------------------------------------------
Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0
time echo 524288 > .../hugepages-2048kB/nr_hugepages
real    0m2.463s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m2.931s

Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1
time echo 524288 > .../hugepages-2048kB/nr_hugepages
real    2m27.609s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    2m29.924s

In the VM environment, enabling hugetlb vmemmap optimization resulted in
allocation times being 61x slower.

A quick profile showed that the vast majority of this overhead was due
to TLB flushing. Each time we modify the kernel pagetable we need to
flush the TLB. For each hugetlb page that is optimized, there could
potentially be two TLB flushes performed: one for the vmemmap pages
associated with the hugetlb page, and potentially another one if the
vmemmap pages are mapped at the PMD level and must be split.

The TLB flushes required for the kernel pagetable result in a broadcast
IPI, with each CPU having to flush a range of pages, or do a global
flush if a threshold is exceeded. So the flush time increases with the
number of CPUs. In addition, in virtual environments the broadcast IPI
can't be accelerated by hypervisor hardware and leads to traps that need
to wake up/IPI all vCPUs, which is very expensive. Because of this, the
slowdown in virtual environments is even worse than on bare metal as the
number of vCPUs/CPUs is increased.

The following series attempts to reduce the amount of time spent in TLB
flushing. The idea is to batch the vmemmap modification operations for
multiple hugetlb pages. Instead of doing one or two TLB flushes for each
page, we do two TLB flushes for each batch of pages: one flush after
splitting pages mapped at the PMD level, and another after remapping the
vmemmap associated with all hugetlb pages.
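The shape of the batched scheme described above, as a simplified sketch.
The helper names split_vmemmap_pmds_no_flush() and remap_vmemmap_no_flush()
are illustrative stand-ins for the series' hugetlb_vmemmap_split() and
__hugetlb_vmemmap_optimize() with their no-flush flags;
free_vmemmap_page_list() matches the series:

/*
 * Sketch only: two global TLB flushes per batch instead of one or two
 * flushes per hugetlb page.
 */
static void optimize_vmemmap_batch(struct hstate *h,
				   struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	/* Pass 1: split all PMD-mapped vmemmap, deferring the TLB flush. */
	list_for_each_entry(folio, folio_list, lru)
		split_vmemmap_pmds_no_flush(h, folio);
	flush_tlb_all();		/* flush #1 for the whole batch */

	/* Pass 2: remap each folio's vmemmap, again deferring the flush. */
	list_for_each_entry(folio, folio_list, lru)
		remap_vmemmap_no_flush(h, folio, &vmemmap_pages);
	flush_tlb_all();		/* flush #2 for the whole batch */

	free_vmemmap_page_list(&vmemmap_pages);
}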
Results of such batching are as follows:

Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895
------------------------------------------------
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0
time echo 500000 > .../hugepages-2048kB/nr_hugepages
real    0m4.719s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m4.245s

next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1
time echo 500000 > .../hugepages-2048kB/nr_hugepages
real    0m7.267s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m13.199s

VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan
-----------------------------------------------------------
next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0
time echo 524288 > .../hugepages-2048kB/nr_hugepages
real    0m2.715s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m3.186s

next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1
time echo 524288 > .../hugepages-2048kB/nr_hugepages
real    0m4.799s
time echo 0 > .../hugepages-2048kB/nr_hugepages
real    0m5.273s

With batching, results are back in the 2-3x slowdown range.

This patch (of 8):

update_and_free_pages_bulk is designed to free a list of hugetlb pages
back to their associated lower level allocators. This may require
allocating vmemmap pages associated with each hugetlb page. The hugetlb
page destructor must be changed before pages are freed to lower level
allocators. However, the destructor must be changed under the hugetlb
lock. This means there is potentially one lock cycle per page.

Minimize the number of lock cycles in update_and_free_pages_bulk by:
1) allocating necessary vmemmap for all hugetlb pages on the list,
2) taking the hugetlb lock and clearing the destructor for all pages on
   the list, and
3) freeing all pages on the list back to the low level allocators.

Link: https://lkml.kernel.org/r/20231019023113.345257-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20231019023113.345257-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz
Reviewed-by: Muchun Song
Acked-by: James Houghton
Cc: Anshuman Khandual
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand
Cc: David Rientjes
Cc: Joao Martins
Cc: Konrad Dybcio
Cc: Matthew Wilcox (Oracle)
Cc: Miaohe Lin
Cc: Michal Hocko
Cc: Naoya Horiguchi
Cc: Oscar Salvador
Cc: Sergey Senozhatsky
Cc: Usama Arif
Cc: Xiongchun Duan
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index da6f85b7db88..b839080a2a6b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1862,7 +1862,46 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
 {
 	struct folio *folio, *t_folio;
+	bool clear_dtor = false;
 
+	/*
+	 * First allocate required vmemmmap (if necessary) for all folios on
+	 * list.  If vmemmap can not be allocated, we can not free folio to
+	 * lower level allocator, so add back as hugetlb surplus page.
+	 * add_hugetlb_folio() removes the page from THIS list.
+	 * Use clear_dtor to note if vmemmap was successfully allocated for
+	 * ANY page on the list.
+	 */
+	list_for_each_entry_safe(folio, t_folio, list, lru) {
+		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
+			if (hugetlb_vmemmap_restore(h, &folio->page)) {
+				spin_lock_irq(&hugetlb_lock);
+				add_hugetlb_folio(h, folio, true);
+				spin_unlock_irq(&hugetlb_lock);
+			} else
+				clear_dtor = true;
+		}
+	}
+
+	/*
+	 * If vmemmmap allocation was performed on any folio above, take lock
+	 * to clear destructor of all folios on list.  This avoids the need to
+	 * lock/unlock for each individual folio.
+	 * The assumption is vmemmap allocation was performed on all or none
+	 * of the folios on the list.  This is true expect in VERY rare cases.
+	 */
+	if (clear_dtor) {
+		spin_lock_irq(&hugetlb_lock);
+		list_for_each_entry(folio, list, lru)
+			__clear_hugetlb_destructor(h, folio);
+		spin_unlock_irq(&hugetlb_lock);
+	}
+
+	/*
+	 * Free folios back to low level allocators.  vmemmap and destructors
+	 * were taken care of above, so update_and_free_hugetlb_folio will
+	 * not need to take hugetlb lock.
+	 */
 	list_for_each_entry_safe(folio, t_folio, list, lru) {
 		update_and_free_hugetlb_folio(h, folio, false);
 		cond_resched();
--
cgit

From d67e32f26713c35669e343edfdce6fa97a7d6bbc Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Wed, 18 Oct 2023 19:31:04 -0700
Subject: hugetlb: restructure pool allocations

Allocation of a hugetlb page for the hugetlb pool is done by the routine
alloc_pool_huge_page. This routine will allocate contiguous pages from a
low level allocator, prep the pages for use as a hugetlb page, and then
add the resulting hugetlb page to the pool.

In the 'prep' stage, optional vmemmap optimization is done. For
performance reasons we want to perform vmemmap optimization on multiple
hugetlb pages at once. To do this, restructure the hugetlb pool
allocation code such that vmemmap optimization can be isolated and later
batched.

The code to allocate hugetlb pages from bootmem was also modified to
allow batching.

No functional changes, only code restructure.
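The restructured flow this patch enables looks roughly like the sketch
below. need_more_pages() is a hypothetical stand-in for the loop
conditions in set_max_huge_pages() and hugetlb_hstate_alloc_pages();
alloc_pool_huge_folio() and prep_and_add_allocated_folios() are
introduced in the diff that follows:

	struct folio *folio;
	LIST_HEAD(folio_list);

	/* Allocate folios to a private list first; no hugetlb_lock needed. */
	while (need_more_pages()) {
		folio = alloc_pool_huge_folio(h, nodes_allowed,
					      node_alloc_noretry);
		if (!folio)
			break;
		list_add(&folio->lru, &folio_list);
	}

	/*
	 * Then optimize vmemmap (batched, in a later patch) and add all
	 * folios to the pool under a single lock cycle.
	 */
	prep_and_add_allocated_folios(h, &folio_list);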
Link: https://lkml.kernel.org/r/20231019023113.345257-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Tested-by: Sergey Senozhatsky Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 141 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b839080a2a6b..559f7c71c596 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,16 +1996,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { folio_set_hugetlb(folio); - hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +{ + init_new_hugetlb_folio(h, folio); + hugetlb_vmemmap_optimize(h, &folio->page); +} + static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); @@ -2202,16 +2207,9 @@ retry: return page_folio(page); } -/* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages - * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. - */ -static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; bool retry = false; @@ -2224,6 +2222,7 @@ retry: nid, nmask, node_alloc_noretry); if (!folio) return NULL; + if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* @@ -2238,32 +2237,81 @@ retry: return NULL; } } - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } +static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + struct folio *folio; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (folio) + init_new_hugetlb_folio(h, folio); + return folio; +} + /* - * Allocates a fresh page to the hugetlb allocator pool in the node interleaved - * manner. + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. 
*/ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; - int nr_nodes, node; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (!folio) + return NULL; + + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + return folio; +} + +static void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + +/* + * Allocates a fresh hugetlb page in a node interleaved manner. The page + * will later be added to the appropriate hugetlb pool. + */ +static struct folio *alloc_pool_huge_folio(struct hstate *h, + nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) +{ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nr_nodes, node; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + struct folio *folio; + + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); - if (folio) { - free_huge_folio(folio); /* free it into the hugepage allocator */ - return 1; - } + if (folio) + return folio; } - return 0; + return NULL; } /* @@ -3302,25 +3350,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, */ static void __init gather_bootmem_prealloc(void) { + LIST_HEAD(folio_list); struct huge_bootmem_page *m; + struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; - struct hstate *h = m->hstate; + + h = m->hstate; + /* + * It is possible to have multiple huge page sizes (hstates) + * in this list. If so, process each size separately. + */ + if (h != prev_h && prev_h != NULL) + prep_and_add_allocated_folios(prev_h, &folio_list); + prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + __prep_new_hugetlb_folio(h, folio); /* If HVO fails, initialize all tail struct pages */ if (!HPageVmemmapOptimized(&folio->page)) hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); - free_huge_folio(folio); /* add to the hugepage allocator */ + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages @@ -3330,6 +3388,8 @@ static void __init gather_bootmem_prealloc(void) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } + + prep_and_add_allocated_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) @@ -3363,9 +3423,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +/* + * NOTE: this routine is called in different contexts for gigantic and + * non-gigantic pages. 
+ * - For gigantic pages, this is called early in the boot process and + * pages are allocated from memblock allocated or something similar. + * Gigantic pages are actually added to pools later with the routine + * gather_bootmem_prealloc. + * - For non-gigantic pages, this is called later in the boot process after + * all of mm is up and functional. Pages are allocated from buddy and + * then added to hugetlb pools. + */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; bool node_specific_alloc = false; @@ -3407,14 +3480,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { + /* + * gigantic pages not added to list as they are not + * added to pools now. + */ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; - } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY], - node_alloc_noretry)) - break; + } else { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + } cond_resched(); } + + /* list will be empty if hstate_is_gigantic */ + prep_and_add_allocated_folios(h, &folio_list); + if (i < h->max_huge_pages) { char buf[32]; @@ -3548,7 +3632,9 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { - unsigned long min_count, ret; + unsigned long min_count; + unsigned long allocated; + struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3625,7 +3711,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, break; } - while (count > persistent_huge_pages(h)) { + allocated = 0; + while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page @@ -3636,15 +3723,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed, + folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry); - spin_lock_irq(&hugetlb_lock); - if (!ret) + if (!folio) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + list_add(&folio->lru, &page_list); + allocated++; /* Bail for signals. 
Probably ctrl-c from user */ - if (signal_pending(current)) + if (signal_pending(current)) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + spin_lock_irq(&hugetlb_lock); + } + + /* Add allocated pages to the pool */ + if (!list_empty(&page_list)) { + spin_unlock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); } /* @@ -3670,8 +3774,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - struct folio *folio; - folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; -- cgit From 79359d6d24df2f304abbf6f137b01d2b76175ce3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:05 -0700 Subject: hugetlb: perform vmemmap optimization on a list of pages When adding hugetlb pages to the pool, we first create a list of the allocated pages before adding to the pool. Pass this list of pages to a new routine hugetlb_vmemmap_optimize_folios() for vmemmap optimization. Due to significant differences in vmemmmap initialization for bootmem allocated hugetlb pages, a new routine prep_and_add_bootmem_folios is created. We also modify the routine vmemmap_should_optimize() to check for pages that are already optimized. There are code paths that might request vmemmap optimization twice and we want to make sure this is not attempted. Link: https://lkml.kernel.org/r/20231019023113.345257-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++-------- mm/hugetlb_vmemmap.c | 11 +++++++++++ mm/hugetlb_vmemmap.h | 5 +++++ 3 files changed, 51 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 559f7c71c596..8b171f866d0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2282,6 +2282,9 @@ static void prep_and_add_allocated_folios(struct hstate *h, unsigned long flags; struct folio *folio, *tmp_f; + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { @@ -3344,6 +3347,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static void __init prep_and_add_bootmem_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + if (!folio_test_hugetlb_vmemmap_optimized(folio)) { + /* + * If HVO fails, initialize all tail struct pages + * We do not worry about potential long lock hold + * time as this is early in boot and there should + * be no contention. 
+ */ + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + } + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3364,7 +3396,7 @@ static void __init gather_bootmem_prealloc(void) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) - prep_and_add_allocated_folios(prev_h, &folio_list); + prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); @@ -3372,12 +3404,7 @@ static void __init gather_bootmem_prealloc(void) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - __prep_new_hugetlb_folio(h, folio); - /* If HVO fails, initialize all tail struct pages */ - if (!HPageVmemmapOptimized(&folio->page)) - hugetlb_folio_init_tail_vmemmap(folio, - HUGETLB_VMEMMAP_RESERVE_PAGES, - pages_per_huge_page(h)); + init_new_hugetlb_folio(h, folio); list_add(&folio->lru, &folio_list); /* @@ -3389,7 +3416,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } - prep_and_add_allocated_folios(h, &folio_list); + prep_and_add_bootmem_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 76682d1d79a7..4558b814ffab 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -483,6 +483,9 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { + if (HPageVmemmapOptimized((struct page *)head)) + return false; + if (!READ_ONCE(vmemmap_optimize_enabled)) return false; @@ -572,6 +575,14 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) SetHPageVmemmapOptimized(head); } +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ + struct folio *folio; + + list_for_each_entry(folio, folio_list, lru) + hugetlb_vmemmap_optimize(h, &folio->page); +} + static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 4573899855d7..c512e388dbb4 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -20,6 +20,7 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { @@ -48,6 +49,10 @@ static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page { } +static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ +} + static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; -- cgit From cfb8c75099dbf152db1b075eead5029c5a30ce51 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:06 -0700 Subject: hugetlb: perform vmemmap restoration on a list of pages The routine update_and_free_pages_bulk already performs vmemmap restoration on the list of hugetlb pages in a separate step. 
In preparation for more functionality to be added in this step, create a new routine hugetlb_vmemmap_restore_folios() that will restore vmemmap for a list of folios. This new routine must provide sufficient feedback about errors and actual restoration performed so that update_and_free_pages_bulk can perform optimally. Special care must be taken when encountering an error from hugetlb_vmemmap_restore_folios. We want to continue making as much forward progress as possible. A new routine bulk_vmemmap_restore_error handles this specific situation. Link: https://lkml.kernel.org/r/20231019023113.345257-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 99 +++++++++++++++++++++++++++++++++++++--------------- mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++ mm/hugetlb_vmemmap.h | 11 ++++++ 3 files changed, 120 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b171f866d0a..cf834bb7f820 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,50 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, schedule_work(&free_hpage_work); } -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +static void bulk_vmemmap_restore_error(struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; - bool clear_dtor = false; - /* - * First allocate required vmemmmap (if necessary) for all folios on - * list. If vmemmap can not be allocated, we can not free folio to - * lower level allocator, so add back as hugetlb surplus page. - * add_hugetlb_folio() removes the page from THIS list. - * Use clear_dtor to note if vmemmap was successfully allocated for - * ANY page on the list. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { - if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (!list_empty(non_hvo_folios)) { + /* + * Free any restored hugetlb pages so that restore of the + * entire list can be retried. + * The idea is that in the common case of ENOMEM errors freeing + * hugetlb pages with vmemmap we will free up memory so that we + * can allocate vmemmap for more hugetlb pages. + */ + list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + } + } else { + /* + * In the case where there are no folios which can be + * immediately freed, we loop through the list trying to restore + * vmemmap individually in the hope that someone elsewhere may + * have done something to cause success (such as freeing some + * memory). If unable to restore a hugetlb page, the hugetlb + * page is made a surplus page and removed from the list. + * If are able to restore vmemmap and free one hugetlb page, we + * quit processing the list to retry the bulk operation. 
+ */ + list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore(h, &folio->page)) { + list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); - } else - clear_dtor = true; - } + } else { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + break; + } } +} + +static void update_and_free_pages_bulk(struct hstate *h, + struct list_head *folio_list) +{ + long ret; + struct folio *folio, *t_folio; + LIST_HEAD(non_hvo_folios); /* - * If vmemmmap allocation was performed on any folio above, take lock - * to clear destructor of all folios on list. This avoids the need to - * lock/unlock for each individual folio. - * The assumption is vmemmap allocation was performed on all or none - * of the folios on the list. This is true expect in VERY rare cases. + * First allocate required vmemmmap (if necessary) for all folios. + * Carefully handle errors and free up any available hugetlb pages + * in an effort to make forward progress. */ - if (clear_dtor) { +retry: + ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); + if (ret < 0) { + bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); + goto retry; + } + + /* + * At this point, list should be empty, ret should be >= 0 and there + * should only be pages on the non_hvo_folios list. + * Do note that the non_hvo_folios list could be empty. + * Without HVO enabled, ret will be 0 and there is no need to call + * __clear_hugetlb_destructor as this was done previously. + */ + VM_WARN_ON(!list_empty(folio_list)); + VM_WARN_ON(ret < 0); + if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(folio, list, lru) + list_for_each_entry(folio, &non_hvo_folios, lru) __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } - /* - * Free folios back to low level allocators. vmemmap and destructors - * were taken care of above, so update_and_free_hugetlb_folio will - * not need to take hugetlb lock. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { + list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4558b814ffab..77f44b81ff01 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -480,6 +480,44 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) return ret; } +/** + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. + * @h: hstate. + * @folio_list: list of folios. + * @non_hvo_folios: Output list of folios for which vmemmap exists. + * + * Return: number of folios for which vmemmap was restored, or an error code + * if an error was encountered restoring vmemmap for a folio. + * Folios that have vmemmap are moved to the non_hvo_folios + * list. Processing of entries stops when the first error is + * encountered. The folio that experienced the error and all + * non-processed folios will remain on folio_list. 
+ */ +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + struct folio *folio, *t_folio; + long restored = 0; + long ret = 0; + + list_for_each_entry_safe(folio, t_folio, folio_list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + ret = hugetlb_vmemmap_restore(h, &folio->page); + if (ret) + break; + restored++; + } + + /* Add non-optimized folios to output list */ + list_move(&folio->lru, non_hvo_folios); + } + + if (!ret) + ret = restored; + return ret; +} + /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index c512e388dbb4..a0dcf49f46ba 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -19,6 +19,9 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); @@ -45,6 +48,14 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h return 0; } +static long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + list_splice_init(folio_list, non_hvo_folios); + return 0; +} + static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -- cgit From 91f386bf0772c1976622bd0b119b78094603c3d0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:07 -0700 Subject: hugetlb: batch freeing of vmemmap pages Now that batching of hugetlb vmemmap optimization processing is possible, batch the freeing of vmemmap pages. When freeing vmemmap pages for a hugetlb page, we add them to a list that is freed after the entire batch has been processed. This enhances the ability to return contiguous ranges of memory to the low level allocators. Link: https://lkml.kernel.org/r/20231019023113.345257-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 82 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 77f44b81ff01..4ac521e596db 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -251,7 +251,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, } entry = mk_pte(walk->reuse_page, pgprot); - list_add_tail(&page->lru, walk->vmemmap_pages); + list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } @@ -306,18 +306,20 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. + * @vmemmap_pages: list to deposit vmemmap pages to be freed. 
It is callers + * responsibility to free pages. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse) + unsigned long reuse, + struct list_head *vmemmap_pages) { int ret; - LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, + .vmemmap_pages = vmemmap_pages, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -334,7 +336,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); - list_add(&walk.reuse_page->lru, &vmemmap_pages); + list_add(&walk.reuse_page->lru, vmemmap_pages); } /* @@ -365,15 +367,13 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, + .vmemmap_pages = vmemmap_pages, }; vmemmap_remap_range(reuse, end, &walk); } mmap_read_unlock(&init_mm); - free_vmemmap_page_list(&vmemmap_pages); - return ret; } @@ -389,7 +389,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; - list_add_tail(&page->lru, list); + list_add(&page->lru, list); } return 0; @@ -577,24 +577,17 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h return true; } -/** - * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. - * @h: struct hstate. - * @head: the head page whose vmemmap pages will be optimized. - * - * This function only tries to optimize @head's vmemmap pages and does not - * guarantee that the optimization will succeed after it returns. The caller - * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages - * have been optimized. - */ -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +static int __hugetlb_vmemmap_optimize(const struct hstate *h, + struct page *head, + struct list_head *vmemmap_pages) { + int ret = 0; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE(!PageHuge(head)); if (!vmemmap_should_optimize(h, head)) - return; + return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); @@ -604,21 +597,58 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) - * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_start, @vmemmap_end] is mapped to. + * to the page which @vmemmap_reuse is mapped to. Add pages previously + * mapping the range to vmemmap_pages list so that they can be freed by + * the caller. */ - if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages); + if (ret) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); + + return ret; +} + +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. 
The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. + */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +{ + LIST_HEAD(vmemmap_pages); + + __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages); + free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; + LIST_HEAD(vmemmap_pages); + + list_for_each_entry(folio, folio_list, lru) { + int ret = __hugetlb_vmemmap_optimize(h, &folio->page, + &vmemmap_pages); + + /* + * Pages to be freed may have been accumulated. If we + * encounter an ENOMEM, free what we have and try again. + */ + if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { + free_vmemmap_page_list(&vmemmap_pages); + INIT_LIST_HEAD(&vmemmap_pages); + __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); + } + } - list_for_each_entry(folio, folio_list, lru) - hugetlb_vmemmap_optimize(h, &folio->page); + free_vmemmap_page_list(&vmemmap_pages); } static struct ctl_table hugetlb_vmemmap_sysctls[] = { -- cgit From f4b7e3efaddbf8fa7cffacb5956b0823b7a7730a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 18 Oct 2023 19:31:08 -0700 Subject: hugetlb: batch PMD split for bulk vmemmap dedup In an effort to minimize amount of TLB flushes, batch all PMD splits belonging to a range of pages in order to perform only 1 (global) TLB flush. Add a flags field to the walker and pass whether it's a bulk allocation or just a single page to decide to remap. First value (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB flush when we split the PMD. Rebased and updated by Mike Kravetz Link: https://lkml.kernel.org/r/20231019023113.345257-7-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4ac521e596db..10739e4285d5 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -27,6 +27,8 @@ * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. + * @flags: used to modify behavior in vmemmap page table walking + * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, @@ -35,9 +37,13 @@ struct vmemmap_remap_walk { struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; + +/* Skip the TLB flush when we split the PMD */ +#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) + unsigned long flags; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) { pmd_t __pmd; int i; @@ -80,7 +86,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) /* Make pte visible before pmd. See comment in pmd_install(). 
*/ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); + if (flush) + flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -127,11 +134,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, do { int ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, + !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)); if (ret) return ret; next = pmd_addr_end(addr, end); + + /* + * We are only splitting, not remapping the hugetlb vmemmap + * pages. + */ + if (!walk->remap_pte) + continue; + vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -198,7 +214,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); + if (walk->remap_pte) + flush_tlb_kernel_range(start, end); return 0; } @@ -297,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } +/** + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) + * backing PMDs of the directmap into PTEs + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_split(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + struct vmemmap_remap_walk walk = { + .remap_pte = NULL, + .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return ret; +} + /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap @@ -320,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -368,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); @@ -419,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, + .flags = 0, }; /* See the comment in the vmemmap_remap_free(). 
*/ @@ -628,11 +678,45 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) free_vmemmap_page_list(&vmemmap_pages); } +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!vmemmap_should_optimize(h, head)) + return 0; + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * Split PMDs on the vmemmap virtual address range [@vmemmap_start, + * @vmemmap_end] + */ + return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); +} + void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; LIST_HEAD(vmemmap_pages); + list_for_each_entry(folio, folio_list, lru) { + int ret = hugetlb_vmemmap_split(h, &folio->page); + + /* + * Spliting the PMD requires allocating a page, thus lets fail + * early once we encounter the first OOM. No point in retrying + * as it can be dynamically done on remap with the memory + * we get back from the vmemmap deduplication. + */ + if (ret == -ENOMEM) + break; + } + + flush_tlb_all(); + list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); -- cgit From f13b83fdd996409e3dbf6f34c7629a9d13fdb9c1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 18 Oct 2023 19:31:09 -0700 Subject: hugetlb: batch TLB flushes when freeing vmemmap Now that a list of pages is deduplicated at once, the TLB flush can be batched for all vmemmap pages that got remapped. Expand the flags field value to pass whether to skip the TLB flush on remap of the PTE. The TLB flush is global as we don't have guarantees from caller that the set of folios is contiguous, or to add complexity in composing a list of kVAs to flush. Modified by Mike Kravetz to perform TLB flush on single folio if an error is encountered. Link: https://lkml.kernel.org/r/20231019023113.345257-8-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 10739e4285d5..9df350372046 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -40,6 +40,8 @@ struct vmemmap_remap_walk { /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) +/* Skip the TLB flush when we remap the PTE */ +#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) unsigned long flags; }; @@ -214,7 +216,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - if (walk->remap_pte) + if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; @@ -355,19 +357,21 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. 
+ * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, - struct list_head *vmemmap_pages) + struct list_head *vmemmap_pages, + unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, - .flags = 0, + .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -629,7 +633,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h static int __hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head, - struct list_head *vmemmap_pages) + struct list_head *vmemmap_pages, + unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; @@ -640,6 +645,18 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); + /* + * Very Subtle + * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed + * immediately after remapping. As a result, subsequent accesses + * and modifications to struct pages associated with the hugetlb + * page could be to the OLD struct pages. Set the vmemmap optimized + * flag here so that it is copied to the new head page. This keeps + * the old and new struct pages in sync. + * If there is an error during optimization, we will immediately FLUSH + * the TLB and clear the flag below. + */ + SetHPageVmemmapOptimized(head); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; @@ -651,11 +668,12 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ - ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages); - if (ret) + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, + vmemmap_pages, flags); + if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - SetHPageVmemmapOptimized(head); + ClearHPageVmemmapOptimized(head); + } return ret; } @@ -674,7 +692,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages); + __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -719,19 +737,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, - &vmemmap_pages); + &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. + * This can occur in the case that both spliting fails + * halfway and head page allocation also failed. In this + * case __hugetlb_vmemmap_optimize() would free memory + * allowing more vmemmap remaps to occur. 
*/ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { + flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); + __hugetlb_vmemmap_optimize(h, &folio->page, + &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); } } + flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } -- cgit From c24f188b22892908a2a3bb2de0ce7d121dd72989 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:10 -0700 Subject: hugetlb: batch TLB flushes when restoring vmemmap Update the internal hugetlb restore vmemmap code path such that TLB flushing can be batched. Use the existing mechanism of passing the VMEMMAP_REMAP_NO_TLB_FLUSH flag to indicate flushing should not be performed for individual pages. The routine hugetlb_vmemmap_restore_folios is the only user of this new mechanism, and it will perform a global flush after all vmemmap is restored. Link: https://lkml.kernel.org/r/20231019023113.345257-9-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9df350372046..d2999c303031 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -461,18 +461,19 @@ out: * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. + * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse) + unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, - .flags = 0, + .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ @@ -494,17 +495,7 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -/** - * hugetlb_vmemmap_restore - restore previously optimized (by - * hugetlb_vmemmap_optimize()) vmemmap pages which - * will be reallocated and remapped. - * @h: struct hstate. - * @head: the head page whose vmemmap pages will be restored. - * - * Return: %0 if @head's vmemmap pages have been reallocated and remapped, - * negative error code otherwise. - */ -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; @@ -525,7 +516,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. 
*/ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse); + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { ClearHPageVmemmapOptimized(head); static_branch_dec(&hugetlb_optimize_vmemmap_key); @@ -534,6 +525,21 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) return ret; } +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +{ + return __hugetlb_vmemmap_restore(h, head, 0); +} + /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. @@ -557,7 +563,8 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = hugetlb_vmemmap_restore(h, &folio->page); + ret = __hugetlb_vmemmap_restore(h, &folio->page, + VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; restored++; @@ -567,6 +574,8 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_move(&folio->lru, non_hvo_folios); } + if (restored) + flush_tlb_all(); if (!ret) ret = restored; return ret; -- cgit From c5ad3233ead566d74918bd76ba2dbdbe83ba1d9d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 11 Oct 2023 15:45:57 +0100 Subject: hugetlb_vmemmap: use folio argument for hugetlb_vmemmap_* functions Most function calls in hugetlb.c are made with folio arguments. This brings hugetlb_vmemmap calls inline with them by using folio instead of head struct page. Head struct page is still needed within these functions. The set/clear/test functions for hugepages are also changed to folio versions. Link: https://lkml.kernel.org/r/20231011144557.1720481-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- mm/hugetlb_vmemmap.c | 50 ++++++++++++++++++++++++++------------------------ mm/hugetlb_vmemmap.h | 8 ++++---- 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf834bb7f820..dd8065e36038 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1747,10 +1747,10 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, /* * If folio is not vmemmap optimized (!clear_dtor), then the folio - * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { + if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1893,7 +1893,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, * quit processing the list to retry the bulk operation. 
*/ list_for_each_entry_safe(folio, t_folio, folio_list, lru) - if (hugetlb_vmemmap_restore(h, &folio->page)) { + if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); @@ -2051,7 +2051,7 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize(h, &folio->page); + hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) @@ -2462,7 +2462,7 @@ retry: * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); @@ -3886,11 +3886,11 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) /* * If vmemmap already existed for folio, the remove routine above would * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be * passed hugetlb folios and will BUG otherwise. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index d2999c303031..87818ee7f01d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -495,14 +495,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, unsigned long flags) +static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE(!PageHuge(head)); - if (!HPageVmemmapOptimized(head)) + if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -518,7 +519,7 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } @@ -526,18 +527,18 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, } /** - * hugetlb_vmemmap_restore - restore previously optimized (by - * hugetlb_vmemmap_optimize()) vmemmap pages which + * hugetlb_vmemmap_restore_folio - restore previously optimized (by + * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be restored. + * @folio: the folio whose vmemmap pages will be restored. * - * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. 
*/ -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - return __hugetlb_vmemmap_restore(h, head, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** @@ -563,7 +564,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore(h, &folio->page, + ret = __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; @@ -640,12 +641,13 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h return true; } -static int __hugetlb_vmemmap_optimize(const struct hstate *h, - struct page *head, +static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, + struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; @@ -665,7 +667,7 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ - SetHPageVmemmapOptimized(head); + folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; @@ -681,27 +683,27 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** - * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be optimized. + * @folio: the folio whose vmemmap pages will be optimized. * - * This function only tries to optimize @head's vmemmap pages and does not + * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller - * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages - * have been optimized. + * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's + * vmemmap pages have been optimized. */ -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -745,7 +747,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { - int ret = __hugetlb_vmemmap_optimize(h, &folio->page, + int ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); @@ -754,14 +756,14 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both splitting fails * halfway and head page allocation also failed.
In this - * case __hugetlb_vmemmap_optimize() would free memory + * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize(h, &folio->page, + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index a0dcf49f46ba..9fd0cb270502 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -18,11 +18,11 @@ #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) @@ -43,7 +43,7 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } @@ -56,7 +56,7 @@ static long hugetlb_vmemmap_restore_folios(const struct hstate *h, return 0; } -static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } -- cgit From 4093602d6bbb2c82b922d1b442a022a069cdb59e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:59 +0100 Subject: nilfs2: convert nilfs_copy_page() to nilfs_copy_folio() Both callers already have a folio, so pass it in and use it directly. Removes a lot of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- mm/util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 8cbbfd3a3d59..eefa0336d38c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -799,6 +799,7 @@ void folio_copy(struct folio *dst, struct folio *src) cond_resched(); } } +EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; -- cgit From 09aec5f9b2505c73a9380800f61ad920cc64a7cd Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Mon, 16 Oct 2023 16:34:46 +0100 Subject: mm: kmsan: panic on failure to allocate early boot metadata Given large enough allocations and a machine with low enough memory (i.e a default QEMU VM), it's entirely possible that kmsan_init_alloc_meta_for_range's shadow+origin allocation fails. Instead of eating a NULL deref kernel oops, check explicitly for memblock_alloc() failure and panic with a nice error message. 
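The shape of the fix follows the usual fail-fast rule for early boot: metadata either allocates or the kernel cannot run, so the failure is reported at the allocation site instead of surfacing later as a NULL dereference. A minimal sketch of the pattern (hypothetical wrapper, not the code in the patch):

    static void * __init early_meta_alloc(size_t size, const char *what)
    {
            void *p = memblock_alloc(size, PAGE_SIZE);

            /* Early boot: no caller can recover from this, so be loud. */
            if (!p)
                    panic("%s: failed to allocate %zu bytes for %s",
                          __func__, size, what);
            return p;
    }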
Alexander Potapenko said: For posterity, it is generally quite important for the allocated shadow and origin to be contiguous, otherwise an unaligned memory write may result in memory corruption (the corresponding unaligned shadow write will be assuming that shadow pages are adjacent). So instead of panicking we could have split the range into smaller ones until the allocation succeeds, but that would've led to hard-to-debug problems in the future. Link: https://lkml.kernel.org/r/20231016153446.132763-1-pedro.falcato@gmail.com Signed-off-by: Pedro Falcato Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 87318f9170f1..b9d05aff313e 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -285,12 +285,17 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) size = PAGE_ALIGN((u64)end - (u64)start); shadow = memblock_alloc(size, PAGE_SIZE); origin = memblock_alloc(size, PAGE_SIZE); + + if (!shadow || !origin) + panic("%s: Failed to allocate metadata memory for early boot range of size %llu", + __func__, size); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); - shadow_p = virt_to_page_or_null((char *)shadow + addr); + shadow_p = virt_to_page((char *)shadow + addr); set_no_shadow_origin_page(shadow_p); shadow_page_for(page) = shadow_p; - origin_p = virt_to_page_or_null((char *)origin + addr); + origin_p = virt_to_page((char *)origin + addr); set_no_shadow_origin_page(origin_p); origin_page_for(page) = origin_p; } -- cgit From 1f4f7f0f8845dbac40289cc3d50b81314c5a12b8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 16 Oct 2023 19:31:03 +0800 Subject: mm/oom_killer: simplify OOM killer info dump helper There is only one caller wants to dump the kill victim info, so just let it call the standalone helper, no need to make the generic info dump helper take an extra argument for that. Result of bloat-o-meter: ./scripts/bloat-o-meter ./mm/oom_kill.old.o ./mm/oom_kill.o add/remove: 0/0 grow/shrink: 1/2 up/down: 131/-142 (-11) Function old new delta oom_kill_process 412 543 +131 out_of_memory 1422 1418 -4 dump_header 562 424 -138 Total: Before=21514, After=21503, chg -0.05% Link: https://lkml.kernel.org/r/20231016113103.86477-1-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Cc: Christian Brauner Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/oom_kill.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 44bde56ecd02..9e6071fde34a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -437,7 +437,7 @@ static void dump_tasks(struct oom_control *oc) } } -static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) { /* one line summary of the oom killer context. 
*/ pr_info("oom-kill:constraint=%s,nodemask=%*pbl", @@ -449,7 +449,7 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) from_kuid(&init_user_ns, task_uid(victim))); } -static void dump_header(struct oom_control *oc, struct task_struct *p) +static void dump_header(struct oom_control *oc) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, @@ -467,8 +467,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) } if (sysctl_oom_dump_tasks) dump_tasks(oc); - if (p) - dump_oom_summary(oc, p); } /* @@ -1029,8 +1027,10 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } task_unlock(victim); - if (__ratelimit(&oom_rs)) - dump_header(oc, victim); + if (__ratelimit(&oom_rs)) { + dump_header(oc); + dump_oom_victim(oc, victim); + } /* * Do we need to kill the entire memory cgroup? @@ -1072,7 +1072,7 @@ static void check_panic_on_oom(struct oom_control *oc) /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) return; - dump_header(oc, NULL); + dump_header(oc); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -1155,7 +1155,7 @@ bool out_of_memory(struct oom_control *oc) select_bad_process(oc); /* Found nothing?!?! */ if (!oc->chosen) { - dump_header(oc, NULL); + dump_header(oc); pr_warn("Out of memory and no killable processes...\n"); /* * If we got here due to an actual allocation at the -- cgit From ca71fe1ad9221a89c6a25f49159c600d9e598ae1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:54 +0800 Subject: mm, pcp: avoid to drain PCP when process exit Patch series "mm: PCP high auto-tuning", v3. The page allocation performance requirements of different workloads are often different. So, we need to tune the PCP (Per-CPU Pageset) high on each CPU automatically to optimize the page allocation performance. The list of patches in series is as follows, [1/9] mm, pcp: avoid to drain PCP when process exit [2/9] cacheinfo: calculate per-CPU data cache size [3/9] mm, pcp: reduce lock contention for draining high-order pages [4/9] mm: restrict the pcp batch scale factor to avoid too long latency [5/9] mm, page_alloc: scale the number of pages that are batch allocated [6/9] mm: add framework for PCP high auto-tuning [7/9] mm: tune PCP high automatically [8/9] mm, pcp: decrease PCP high if free pages < high watermark [9/9] mm, pcp: reduce detecting time of consecutive high order page freeing Patch [1/9], [2/9], [3/9] optimize the PCP draining for consecutive high-order pages freeing. Patch [4/9], [5/9] optimize batch freeing and allocating. Patch [6/9], [7/9], [8/9] implement and optimize a PCP high auto-tuning method. Patch [9/9] optimize the PCP draining for consecutive high order page freeing based on PCP high auto-tuning. The test results for patches with performance impact are as follows, kbuild ====== On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. 
             build time   lock contend%   free_high   alloc_zone
             ----------   -------------   ---------   ----------
base              100.0            14.0       100.0        100.0
patch1             99.5            12.8        19.5         95.6
patch3             99.4            12.6         7.1         95.6
patch5             98.6            11.0         8.1         97.1
patch7             95.1             0.5         2.8         15.6
patch9             95.0             1.0         8.8         20.0

The PCP draining optimization (patch [1/9], [3/9]) and the PCP batch allocation optimization (patch [5/9]) reduce zone lock contention a little. The PCP high auto-tuning (patch [7/9], [9/9]) reduces build time visibly, which was the tuning target: the number of pages allocated from the zone reduces greatly, so the zone lock contention cycles% reduces greatly too.

With the PCP tuning patches (patch [7/9], [9/9]), the average used memory during the test increases up to 18.4% because more pages are cached in the PCP. But at the end of the test, the used memory decreases to the same level as that of the base patch. That is, the pages cached in the PCP will be released back to the zone after they stop being used actively.

netperf SCTP_STREAM_MANY
========================

On a 2-socket Intel server with 128 logical CPU, we tested the SCTP_STREAM_MANY test case of the netperf test suite with 64-pair processes.

         score   lock contend%   free_high   alloc_zone   cache miss rate%
         -----   -------------   ---------   ----------   ----------------
base     100.0             2.1       100.0        100.0                1.3
patch1    99.4             2.1        99.4         99.4                1.3
patch3   106.4             1.3        13.3        106.3                1.3
patch5   106.0             1.2        13.2        105.9                1.3
patch7   103.4             1.9         6.7         90.3                7.6
patch9   108.6             1.3        13.7        108.6                1.3

The PCP draining optimization (patch [1/9]+[3/9]) improves performance. The PCP high auto-tuning (patch [7/9]) reduces performance a little because PCP draining cannot always be triggered in time, so the cache miss rate% increases. The further PCP draining optimization (patch [9/9]) based on PCP tuning restores the performance.

lmbench3 UNIX (AF_UNIX)
=======================

On a 2-socket Intel server with 128 logical CPU, we tested the UNIX (AF_UNIX socket) test case of the lmbench3 test suite with 16-pair processes.

         score   lock contend%   free_high   alloc_zone   cache miss rate%
         -----   -------------   ---------   ----------   ----------------
base     100.0            51.4       100.0        100.0                0.2
patch1   116.8            46.1        69.5        104.3                0.2
patch3   199.1            21.3         7.0        104.9                0.2
patch5   200.0            20.8         7.1        106.9                0.3
patch7   191.6            19.9         6.8        103.8                2.8
patch9   193.4            21.7         7.0        104.7                2.1

The PCP draining optimization (patch [1/9], [3/9]) improves performance much. The PCP tuning (patch [7/9]) reduces performance a little because PCP draining cannot always be triggered in time. The further PCP draining optimization (patch [9/9]) based on PCP tuning restores the performance partly.

The patchset adds several fields in struct per_cpu_pages.
The struct layout before/after the patchset is as follows,

base
====

struct per_cpu_pages {
        spinlock_t            lock;             /*     0     4 */
        int                   count;            /*     4     4 */
        int                   high;             /*     8     4 */
        int                   batch;            /*    12     4 */
        short int             free_factor;      /*    16     2 */
        short int             expire;           /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head      lists[13];        /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

patched
=======

struct per_cpu_pages {
        spinlock_t            lock;             /*     0     4 */
        int                   count;            /*     4     4 */
        int                   high;             /*     8     4 */
        int                   high_min;         /*    12     4 */
        int                   high_max;         /*    16     4 */
        int                   batch;            /*    20     4 */
        u8                    flags;            /*    24     1 */
        u8                    alloc_factor;     /*    25     1 */
        u8                    expire;           /*    26     1 */

        /* XXX 1 byte hole, try to pack */

        short int             free_count;       /*    28     2 */

        /* XXX 2 bytes hole, try to pack */

        struct list_head      lists[13];        /*    32   208 */

        /* size: 256, cachelines: 4, members: 11 */
        /* sum members: 237, holes: 2, sum holes: 3 */
        /* padding: 16 */
} __attribute__((__aligned__(64)));

The size of the struct doesn't change with the patchset.

This patch (of 9):

In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when PCP is mostly used for high-order pages freeing, to improve the cache-hot pages reusing between page allocation and freeing CPUs. But the PCP draining mechanism may be triggered unexpectedly when a process exits. With some customized trace points, it was found that PCP draining (free_high == true) was triggered by an order-1 page freeing with the following call stack,

 => free_unref_page_commit
 => free_unref_page
 => __mmdrop
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

Checking the source code, this is the page table PGD freeing (mm_free_pgd()). It's an order-1 page freeing if CONFIG_PAGE_TABLE_ISOLATION=y, which is a common configuration for security. Just before that, page freeing with the following call stack was found,

 => free_unref_page_commit
 => free_unref_page_list
 => release_pages
 => tlb_batch_pages_flush
 => tlb_finish_mmu
 => exit_mmap
 => __mmput
 => exit_mm
 => do_exit
 => do_group_exit
 => __x64_sys_exit_group
 => do_syscall_64

So, when a process exits,

- a large number of user pages of the process will be freed without page allocation; it's highly possible that pcp->free_factor becomes > 0. In fact, this is the expected behavior to improve process exit performance.

- after freeing all user pages, the PGD will be freed, which is an order-1 page freeing, and the PCP will be drained.

All in all, when a process exits, it's highly possible that the PCP will be drained. This is an unexpected behavior. To avoid this, in the patch, the PCP draining will only be triggered for 2 consecutive high-order page freeings.

On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by the 0-Day kbuild service. With the patch, the cycles% of the spinlock contention (mostly for zone lock) decreases from 14.0% to 12.8% (with PCP size == 367). The number of PCP drainings for high-order page freeing (free_high) decreases 80.5%.

This helps network workloads too through reduced zone lock contention. On a 2-socket Intel server with 128 logical CPU, with the patch, the network bandwidth of the UNIX (AF_UNIX) test case of the lmbench test suite with 16-pair processes increases by 16.8%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 51.4% to 46.1%.
The number of PCP draining for high order pages freeing (free_high) decreases 30.5%. The cache miss rate keeps 0.2%. Link: https://lkml.kernel.org/r/20231016053002.756205-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20231016053002.756205-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f46e519618a0..de547ef9a9ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2370,7 +2370,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, { int high; int pindex; - bool free_high; + bool free_high = false; __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); @@ -2383,8 +2383,13 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, * freeing without allocation. The remainder after bulk freeing * stops will be drained from vmstat refresh context. */ - free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); - + if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { + free_high = (pcp->free_factor && + (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER)); + pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; + } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { + pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; + } high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); -- cgit From 362d37a106dd3f6431b2fdd91d9208b0d023b50d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:56 +0800 Subject: mm, pcp: reduce lock contention for draining high-order pages In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when PCP is mostly used for high-order pages freeing to improve the cache-hot pages reusing between page allocating and freeing CPUs. On system with small per-CPU data cache slice, pages shouldn't be cached before draining to guarantee cache-hot. But on a system with large per-CPU data cache slice, some pages can be cached before draining to reduce zone lock contention. So, in this patch, instead of draining without any caching, "pcp->batch" pages will be cached in PCP before draining if the size of the per-CPU data cache slice is more than "3 * batch". In theory, if the size of per-CPU data cache slice is more than "2 * batch", we can reuse cache-hot pages between CPUs. But considering the other usage of cache (code, other data accessing, etc.), "3 * batch" is used. Note: "3 * batch" is chosen to make sure the optimization works on recent x86_64 server CPUs. If you want to increase it, please check whether it breaks the optimization. On a 2-socket Intel server with 128 logical CPU, with the patch, the network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with 16-pair processes increase 70.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 46.1% to 21.3%. The number of PCP draining for high order pages freeing (free_high) decreases 89.9%. The cache miss rate keeps 0.2%. 
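For illustration, the interaction between this batch-size check and the two-consecutive-freeings detection from patch [1/9] can be modelled in a few lines of user-space C (a sketch with a made-up harness; the real logic lives in free_unref_page_commit()):

    #include <stdbool.h>
    #include <stdio.h>

    #define PCPF_PREV_FREE_HIGH_ORDER 0x01
    #define PCPF_FREE_HIGH_BATCH      0x02
    #define PAGE_ALLOC_COSTLY_ORDER   3

    struct pcp_model {
            unsigned int flags;
            int free_factor;
            int count;      /* pages currently cached on the PCP */
            int batch;
    };

    /* Returns true when the PCP would be drained (free_high): only on
     * the second consecutive high-order freeing, and, when the per-CPU
     * cache slice is large (PCPF_FREE_HIGH_BATCH), only after at least
     * one batch of pages has been kept cached. */
    static bool note_free(struct pcp_model *pcp, unsigned int order)
    {
            bool free_high = false;

            if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
                    free_high = pcp->free_factor &&
                                (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
                                (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
                                 pcp->count >= pcp->batch);
                    pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
            } else {
                    pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
            }
            return free_high;
    }

    int main(void)
    {
            struct pcp_model pcp = {
                    .flags = PCPF_FREE_HIGH_BATCH, .free_factor = 1,
                    .count = 70, .batch = 63,
            };

            note_free(&pcp, 0);     /* order-0 user page frees at exit */
            printf("lone order-1 (PGD) free drains? %d\n", note_free(&pcp, 1));
            printf("second consecutive one drains?  %d\n", note_free(&pcp, 1));
            return 0;
    }

The model prints 0 then 1: a single order-1 free at process exit no longer drains the PCP, while genuinely consecutive high-order freeing still does.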
Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Sudeep Holla Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Signed-off-by: Andrew Morton --- mm/page_alloc.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de547ef9a9ad..b76b1de48a30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "internal.h" #include "shuffle.h" @@ -2385,7 +2386,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { free_high = (pcp->free_factor && - (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER)); + (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && + (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || + pcp->count >= READ_ONCE(pcp->batch))); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; @@ -5418,6 +5421,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } +static void zone_pcp_update_cacheinfo(struct zone *zone) +{ + int cpu; + struct per_cpu_pages *pcp; + struct cpu_cacheinfo *cci; + + for_each_online_cpu(cpu) { + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. + */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); + } +} + +void setup_pcp_cacheinfo(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zone_pcp_update_cacheinfo(zone); +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. -- cgit From 52166607ecc980391b1fffbce0be3074a96d0c7b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:57 +0800 Subject: mm: restrict the pcp batch scale factor to avoid too long latency In page allocator, PCP (Per-CPU Pageset) is refilled and drained in batches to increase page allocation throughput, reduce page allocation/freeing latency per page, and reduce zone lock contention. But too large batch size will cause too long maximal allocation/freeing latency, which may punish arbitrary users. So the default batch size is chosen carefully (in zone_batchsize(), the value is 63 for zone > 1GB) to avoid that. In commit 3b12e7e97938 ("mm/page_alloc: scale the number of pages that are batch freed"), the batch size will be scaled for large number of page freeing to improve page freeing performance and reduce zone lock contention. Similar optimization can be used for large number of pages allocation too. To find out a suitable max batch scale factor (that is, max effective batch size), some tests and measurement on some machines were done as follows. 
A set of debug patches are implemented as follows,

- Set PCP high to be 2 * batch to reduce the effect of PCP high

- Disable free batch size scaling to get the raw performance.

- The code with zone lock held is extracted from rmqueue_bulk() and
  free_pcppages_bulk() to 2 separate functions to make it easy to
  measure the function run time with ftrace function_graph tracer.

- The batch size is hard coded to be 63 (default), 127, 255, 511,
  1023, 2047, 4095.

Then will-it-scale/page_fault1 is used to generate the page allocation/freeing workload. The page allocation/freeing throughput (page/s) is measured via will-it-scale. The page allocation/freeing average latency (alloc/free latency avg, in us) and allocation/freeing latency at 99 percentile (alloc/free latency 99%, in us) are measured with ftrace function_graph tracer.

The test results are as follows,

Sapphire Rapids Server
======================
Batch   throughput   free latency   free latency   alloc latency   alloc latency
        page/s       avg / us       99% / us       avg / us        99% / us
-----   ----------   ------------   ------------   -------------   -------------
  63    513633.4     2.33           3.57           2.67            6.83
 127    517616.7     4.35           6.65           4.22            13.03
 255    520822.8     8.29           13.32          7.52            25.24
 511    524122.0     15.79          23.42          14.02           49.35
1023    525980.5     30.25          44.19          25.36           94.88
2047    526793.6     59.39          84.50          45.22           140.81

Ice Lake Server
===============
Batch   throughput   free latency   free latency   alloc latency   alloc latency
        page/s       avg / us       99% / us       avg / us        99% / us
-----   ----------   ------------   ------------   -------------   -------------
  63    620210.3     2.21           3.68           2.02            4.35
 127    627003.0     4.09           6.86           3.51            8.28
 255    630777.5     7.70           13.50          6.17            15.97
 511    633651.5     14.85          22.62          11.66           31.08
1023    637071.1     28.55          42.02          20.81           54.36
2047    638089.7     56.54          84.06          39.28           91.68

Cascade Lake Server
===================
Batch   throughput   free latency   free latency   alloc latency   alloc latency
        page/s       avg / us       99% / us       avg / us        99% / us
-----   ----------   ------------   ------------   -------------   -------------
  63    404706.7     3.29           5.03           3.53            4.75
 127    422475.2     6.12           9.09           6.36            8.76
 255    411522.2     11.68          16.97          10.90           16.39
 511    428124.1     22.54          31.28          19.86           32.25
1023    414718.4     43.39          62.52          40.00           66.33
2047    429848.7     86.64          120.34         71.14           106.08

Comet Lake Desktop
==================
Batch   throughput   free latency   free latency   alloc latency   alloc latency
        page/s       avg / us       99% / us       avg / us        99% / us
-----   ----------   ------------   ------------   -------------   -------------
  63    795183.13    2.18           3.55           2.03            3.05
 127    803067.85    3.91           6.56           3.85            5.52
 255    812771.10    7.35           10.80          7.14            10.20
 511    817723.48    14.17          27.54          13.43           30.31
1023    818870.19    27.72          40.10          27.89           46.28

Coffee Lake Desktop
===================
Batch   throughput   free latency   free latency   alloc latency   alloc latency
        page/s       avg / us       99% / us       avg / us        99% / us
-----   ----------   ------------   ------------   -------------   -------------
  63    510542.8     3.13           4.40           2.48            3.43
 127    514288.6     5.97           7.89           4.65            6.04
 255    516889.7     11.86          15.58          8.96            12.55
 511    519802.4     23.10          28.81          16.95           26.19
1023    520802.7     45.30          52.51          33.19           45.95
2047    519997.1     90.63          104.00         65.26           81.74

From the above data, to restrict the allocation/freeing latency to be less than 100 us in most times, the max batch scale factor needs to be less than or equal to 5. Although it is reasonable to use 5 as max batch scale factor for the systems tested, there are also slower systems. Where smaller value should be used to constrain the page allocation/freeing latency. So, in this patch, a new kconfig option (PCP_BATCH_SCALE_MAX) is added to set the max batch scale factor.
Whose default value is 5, and users can reduce it when necessary. Link: https://lkml.kernel.org/r/20231016053002.756205-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Andrew Morton Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/Kconfig | 11 +++++++++++ mm/page_alloc.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 45f2971ef642..89971a894b60 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,6 +705,17 @@ config HUGETLB_PAGE_SIZE_VARIABLE config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA +config PCP_BATCH_SCALE_MAX + int "Maximum scale factor of PCP (Per-CPU pageset) batch allocate/free" + default 5 + range 0 6 + help + In page allocator, PCP (Per-CPU pageset) is refilled and drained in + batches. The batch number is scaled automatically to improve page + allocation/free throughput. But too large scale factor may hurt + latency. This option sets the upper limit of scale factor to limit + the maximum latency. + config PHYS_ADDR_T_64BIT def_bool 64BIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b76b1de48a30..05a8d34827d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2340,7 +2340,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) * freeing of pages without any allocation. */ batch <<= pcp->free_factor; - if (batch < max_nr_free) + if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->free_factor++; batch = clamp(batch, min_nr_free, max_nr_free); -- cgit From c0a242394cb980bd00e1f61dc8aacb453d2bbe6a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:58 +0800 Subject: mm, page_alloc: scale the number of pages that are batch allocated When a task is allocating a large number of order-0 pages, it may acquire the zone->lock multiple times allocating pages in batches. This may unnecessarily contend on the zone lock when allocating very large number of pages. This patch adapts the size of the batch based on the recent pattern to scale the batch size for subsequent allocations. On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. With the patch, the cycles% of the spinlock contention (mostly for zone lock) decreases from 12.6% to 11.0% (with PCP size == 367). Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mel Gorman Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05a8d34827d2..6f6af835b05b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,6 +2373,12 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high = false; + /* + * On freeing, reduce the number of pages that are batch allocated. 
+ * See nr_pcp_alloc() where alloc_factor is increased for subsequent + * allocations. + */ + pcp->alloc_factor >>= 1; __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); @@ -2679,6 +2685,42 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return page; } +static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) +{ + int high, batch, max_nr_alloc; + + high = READ_ONCE(pcp->high); + batch = READ_ONCE(pcp->batch); + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* + * Double the number of pages allocated each time there is subsequent + * allocation of order-0 pages without any freeing. + */ + if (!order) { + max_nr_alloc = max(high - pcp->count - batch, batch); + batch <<= pcp->alloc_factor; + if (batch <= max_nr_alloc && + pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) + pcp->alloc_factor++; + batch = min(batch, max_nr_alloc); + } + + /* + * Scale batch relative to order if batch implies free pages + * can be stored on the PCP. Batch can be 1 for small zones or + * for boot pagesets which should never store free pages as + * the pages may belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + + return batch; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -2691,18 +2733,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, do { if (list_empty(list)) { - int batch = READ_ONCE(pcp->batch); + int batch = nr_pcp_alloc(pcp, order); int alloced; - /* - * Scale batch relative to order if batch implies - * free pages can be stored on the PCP. Batch can - * be 1 for small zones or for boot pagesets which - * should never store free pages as the pages may - * belong to arbitrary zones. - */ - if (batch > 1) - batch = max(batch >> order, 2); alloced = rmqueue_bulk(zone, order, batch, list, migratetype, alloc_flags); -- cgit From 90b41691b9881376fe784e13b5766ec3676fdb55 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:59 +0800 Subject: mm: add framework for PCP high auto-tuning The page allocation performance requirements of different workloads are usually different. So, we need to tune PCP (per-CPU pageset) high to optimize the workload page allocation performance. Now, we have a system wide sysctl knob (percpu_pagelist_high_fraction) to tune PCP high by hand. But, it's hard to find out the best value by hand. And one global configuration may not work best for the different workloads that run on the same system. One solution to these issues is to tune PCP high of each CPU automatically. This patch adds the framework for PCP high auto-tuning. With it, pcp->high of each CPU will be changed automatically by tuning algorithm at runtime. The minimal high (pcp->high_min) is the original PCP high value calculated based on the low watermark pages. While the maximal high (pcp->high_max) is the PCP high value when percpu_pagelist_high_fraction sysctl knob is set to MIN_PERCPU_PAGELIST_HIGH_FRACTION. That is, the maximal pcp->high that can be set via sysctl knob by hand. It's possible that PCP high auto-tuning doesn't work well for some workloads. So, when PCP high is tuned by hand via the sysctl knob, the auto-tuning will be disabled. The PCP high set by hand will be used instead. 
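For illustration, the tuning range this sets up can be sketched in a few lines (hypothetical struct and helper names, not the kernel code):

    struct pcp_range { int high, high_min, high_max; };

    /* Manual sysctl tuning collapses the range, so later auto-tuning,
     * which always clamps into [high_min, high_max], can never move
     * pcp->high away from the administrator's value. */
    static void pcp_set_range(struct pcp_range *pcp, int manual,
                              int auto_min, int auto_max)
    {
            pcp->high_min = manual ? manual : auto_min;
            pcp->high_max = manual ? manual : auto_max;
    }

    static int pcp_clamp_high(struct pcp_range *pcp)
    {
            if (pcp->high < pcp->high_min)
                    pcp->high = pcp->high_min;
            if (pcp->high > pcp->high_max)
                    pcp->high = pcp->high_max;
            return pcp->high;
    }

The later patches in the series then move pcp->high inside that range on refill, periodic decay, and reclaim events.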
This patch only adds the framework, so pcp->high will be set to pcp->high_min (original default) always. We will add actual auto-tuning algorithm in the following patches in the series. Link: https://lkml.kernel.org/r/20231016053002.756205-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 71 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f6af835b05b..922836b6a01d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2350,7 +2350,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, bool free_high) { - int high = READ_ONCE(pcp->high); + int high = READ_ONCE(pcp->high_min); if (unlikely(!high || free_high)) return 0; @@ -2689,7 +2689,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) { int high, batch, max_nr_alloc; - high = READ_ONCE(pcp->high); + high = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); /* Check for PCP disabled or boot pageset */ @@ -5296,14 +5296,15 @@ static int zone_batchsize(struct zone *zone) } static int percpu_pagelist_high_fraction; -static int zone_highsize(struct zone *zone, int batch, int cpu_online) +static int zone_highsize(struct zone *zone, int batch, int cpu_online, + int high_fraction) { #ifdef CONFIG_MMU int high; int nr_split_cpus; unsigned long total_pages; - if (!percpu_pagelist_high_fraction) { + if (!high_fraction) { /* * By default, the high value of the pcp is based on the zone * low watermark so that if they are full then background @@ -5316,15 +5317,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * value is based on a fraction of the managed pages in the * zone. */ - total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + total_pages = zone_managed_pages(zone) / high_fraction; } /* * Split the high value across all online CPUs local to the zone. Note * that early in boot that CPUs may not be online yet and that during * CPU hotplug that the cpumask is not yet updated when a CPU is being - * onlined. For memory nodes that have no CPUs, split pcp->high across - * all online CPUs to mitigate the risk that reclaim is triggered + * onlined. For memory nodes that have no CPUs, split the high value + * across all online CPUs to mitigate the risk that reclaim is triggered * prematurely due to pages stored on pcp lists. */ nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; @@ -5352,19 +5353,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * However, guaranteeing these relations at all times would require e.g. write * barriers here but also careful usage of read barriers at the read side, and * thus be prone to error and bad for performance. Thus the update only prevents - * store tearing. Any new users of pcp->batch and pcp->high should ensure they - * can cope with those fields changing asynchronously, and fully trust only the - * pcp->count field on the local CPU with interrupts disabled. + * store tearing. 
Any new users of pcp->batch, pcp->high_min and pcp->high_max + * should ensure they can cope with those fields changing asynchronously, and + * fully trust only the pcp->count field on the local CPU with interrupts + * disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters * exist). */ -static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, - unsigned long batch) +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min, + unsigned long high_max, unsigned long batch) { WRITE_ONCE(pcp->batch, batch); - WRITE_ONCE(pcp->high, high); + WRITE_ONCE(pcp->high_min, high_min); + WRITE_ONCE(pcp->high_max, high_max); } static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) @@ -5384,20 +5387,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta * need to be as careful as pageset_update() as nobody can access the * pageset yet. */ - pcp->high = BOOT_PAGESET_HIGH; + pcp->high_min = BOOT_PAGESET_HIGH; + pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; pcp->free_factor = 0; } -static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, - unsigned long batch) +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, + unsigned long high_max, unsigned long batch) { struct per_cpu_pages *pcp; int cpu; for_each_possible_cpu(cpu) { pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - pageset_update(pcp, high, batch); + pageset_update(pcp, high_min, high_max, batch); } } @@ -5407,19 +5411,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h */ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { - int new_high, new_batch; + int new_high_min, new_high_max, new_batch; new_batch = max(1, zone_batchsize(zone)); - new_high = zone_highsize(zone, new_batch, cpu_online); + if (percpu_pagelist_high_fraction) { + new_high_min = zone_highsize(zone, new_batch, cpu_online, + percpu_pagelist_high_fraction); + /* + * PCP high is tuned manually, disable auto-tuning via + * setting high_min and high_max to the manual value. 
+ */ + new_high_max = new_high_min; + } else { + new_high_min = zone_highsize(zone, new_batch, cpu_online, 0); + new_high_max = zone_highsize(zone, new_batch, cpu_online, + MIN_PERCPU_PAGELIST_HIGH_FRACTION); + } - if (zone->pageset_high == new_high && + if (zone->pageset_high_min == new_high_min && + zone->pageset_high_max == new_high_max && zone->pageset_batch == new_batch) return; - zone->pageset_high = new_high; + zone->pageset_high_min = new_high_min; + zone->pageset_high_max = new_high_max; zone->pageset_batch = new_batch; - __zone_set_pageset_high_and_batch(zone, new_high, new_batch); + __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max, + new_batch); } void __meminit setup_zone_pageset(struct zone *zone) @@ -5528,7 +5547,8 @@ __meminit void zone_pcp_init(struct zone *zone) */ zone->per_cpu_pageset = &boot_pageset; zone->per_cpu_zonestats = &boot_zonestats; - zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_high_min = BOOT_PAGESET_HIGH; + zone->pageset_high_max = BOOT_PAGESET_HIGH; zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) @@ -6430,13 +6450,14 @@ EXPORT_SYMBOL(free_contig_range); void zone_pcp_disable(struct zone *zone) { mutex_lock(&pcp_batch_high_lock); - __zone_set_pageset_high_and_batch(zone, 0, 1); + __zone_set_pageset_high_and_batch(zone, 0, 0, 1); __drain_all_pages(zone, true); } void zone_pcp_enable(struct zone *zone) { - __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); + __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min, + zone->pageset_high_max, zone->pageset_batch); mutex_unlock(&pcp_batch_high_lock); } -- cgit From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:00 +0800 Subject: mm: tune PCP high automatically The target to tune PCP high automatically is as follows, - Minimize allocation/freeing from/to shared zone - Minimize idle pages in PCP - Minimize pages in PCP if the system free pages is too few To reach these target, a tuning algorithm as follows is designed, - When we refill PCP via allocating from the zone, increase PCP high. Because if we had larger PCP, we could avoid to allocate from the zone. - In periodic vmstat updating kworker (via refresh_cpu_vm_stats()), decrease PCP high to try to free possible idle PCP pages. - When page reclaiming is active for the zone, stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. So, the PCP high can be tuned to the page allocating/freeing depth of workloads eventually. One issue of the algorithm is that if the number of pages allocated is much more than that of pages freed on a CPU, the PCP high may become the maximal value even if the allocating/freeing depth is small. But this isn't a severe issue, because there are no idle pages in this case. One alternative choice is to increase PCP high when we drain PCP via trying to free pages to the zone, but don't increase PCP high during PCP refilling. This can avoid the issue above. But if the number of pages allocated is much less than that of pages freed on a CPU, there will be many idle pages in PCP and it is hard to free these idle pages. 1/8 (>> 3) of PCP high will be decreased periodically. The value 1/8 is kind of arbitrary. Just to make sure that the idle PCP pages will be freed eventually. On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. 
This simulates the kbuild server that is used by 0-Day kbuild service. With the patch, the build time decreases 3.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 11.0% to 0.5%. The number of PCP draining for high order pages freeing (free_high) decreases 65.6%. The number of pages allocated from zone (instead of from PCP) decreases 83.9%. Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mel Gorman Suggested-by: Michal Hocko Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 119 ++++++++++++++++++++++++++++++++++++++++++++------------ mm/vmstat.c | 8 ++-- 2 files changed, 98 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 922836b6a01d..83af76a8cef9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2157,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return i; } +/* + * Called from the vmstat counter updater to decay the PCP high. + * Return whether there are addition works to do. + */ +int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +{ + int high_min, to_drain, batch; + int todo = 0; + + high_min = READ_ONCE(pcp->high_min); + batch = READ_ONCE(pcp->batch); + /* + * Decrease pcp->high periodically to try to free possible + * idle PCP pages. And, avoid to free too many pages to + * control latency. This caps pcp->high decrement too. + */ + if (pcp->high > high_min) { + pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), + pcp->high - (pcp->high >> 3), high_min); + if (pcp->high > high_min) + todo++; + } + + to_drain = pcp->count - pcp->high; + if (to_drain > 0) { + spin_lock(&pcp->lock); + free_pcppages_bulk(zone, to_drain, pcp, 0); + spin_unlock(&pcp->lock); + todo++; + } + + return todo; +} + #ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this @@ -2318,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) +static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; - int batch = READ_ONCE(pcp->batch); - /* Free everything if batch freeing high-order pages. */ + /* Free as much as possible if batch freeing high-order pages. */ if (unlikely(free_high)) - return pcp->count; + return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); /* Check for PCP disabled or boot pageset */ if (unlikely(high < batch)) @@ -2340,7 +2373,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) * freeing of pages without any allocation. 
*/ batch <<= pcp->free_factor; - if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) + if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->free_factor++; batch = clamp(batch, min_nr_free, max_nr_free); @@ -2348,28 +2381,48 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) } static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, - bool free_high) + int batch, bool free_high) { - int high = READ_ONCE(pcp->high_min); + int high, high_min, high_max; + + high_min = READ_ONCE(pcp->high_min); + high_max = READ_ONCE(pcp->high_max); + high = pcp->high = clamp(pcp->high, high_min, high_max); - if (unlikely(!high || free_high)) + if (unlikely(!high)) return 0; - if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) - return high; + if (unlikely(free_high)) { + pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), + high_min); + return 0; + } /* * If reclaim is active, limit the number of pages that can be * stored on pcp lists */ - return min(READ_ONCE(pcp->batch) << 2, high); + if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { + pcp->high = max(high - (batch << pcp->free_factor), high_min); + return min(batch << 2, pcp->high); + } + + if (pcp->count >= high && high_min != high_max) { + int need_high = (batch << pcp->free_factor) + batch; + + /* pcp->high should be large enough to hold batch freed pages */ + if (pcp->high < need_high) + pcp->high = clamp(need_high, high_min, high_max); + } + + return high; } static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, unsigned int order) { - int high; + int high, batch; int pindex; bool free_high = false; @@ -2384,6 +2437,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; + batch = READ_ONCE(pcp->batch); /* * As high-order pages other than THP's stored on PCP can contribute * to fragmentation, limit the number stored when PCP is heavily @@ -2394,14 +2448,15 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, free_high = (pcp->free_factor && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(pcp->batch))); + pcp->count >= READ_ONCE(batch))); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; } - high = nr_pcp_high(pcp, zone, free_high); + high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); } } @@ -2685,24 +2740,38 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return page; } -static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) +static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) { - int high, batch, max_nr_alloc; + int high, base_batch, batch, max_nr_alloc; + int high_max, high_min; - high = READ_ONCE(pcp->high_min); - batch = READ_ONCE(pcp->batch); + base_batch = READ_ONCE(pcp->batch); + high_min = READ_ONCE(pcp->high_min); + high_max = READ_ONCE(pcp->high_max); + high = pcp->high = clamp(pcp->high, high_min, high_max); /* Check for PCP disabled or boot pageset */ - if (unlikely(high < batch)) + if (unlikely(high < base_batch)) return 1; + if (order) + batch = 
base_batch; + else + batch = (base_batch << pcp->alloc_factor); + /* - * Double the number of pages allocated each time there is subsequent - * allocation of order-0 pages without any freeing. + * If we had larger pcp->high, we could avoid to allocate from + * zone. */ + if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + high = pcp->high = min(high + batch, high_max); + if (!order) { - max_nr_alloc = max(high - pcp->count - batch, batch); - batch <<= pcp->alloc_factor; + max_nr_alloc = max(high - pcp->count - base_batch, base_batch); + /* + * Double the number of pages allocated each time there is + * subsequent allocation of order-0 pages without any freeing. + */ if (batch <= max_nr_alloc && pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->alloc_factor++; @@ -2733,7 +2802,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, do { if (list_empty(list)) { - int batch = nr_pcp_alloc(pcp, order); + int batch = nr_pcp_alloc(pcp, zone, order); int alloced; alloced = rmqueue_bulk(zone, order, diff --git a/mm/vmstat.c b/mm/vmstat.c index 88ea95d4221c..359460deb377 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -816,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; -#ifdef CONFIG_NUMA struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; -#endif for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { int v; @@ -834,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) #endif } } -#ifdef CONFIG_NUMA if (do_pagesets) { cond_resched(); + + changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); +#ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this * processor @@ -866,8 +866,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) drain_zone_pages(zone, this_cpu_ptr(pcp)); changes++; } - } #endif + } } for_each_online_pgdat(pgdat) { -- cgit From 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:01 +0800 Subject: mm, pcp: decrease PCP high if free pages < high watermark One target of PCP is to minimize pages in PCP if the system free pages is too few. To reach that target, when page reclaiming is active for the zone (ZONE_RECLAIM_ACTIVE), we will stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. But this may be too late because the background page reclaiming may introduce latency for some workloads. So, in this patch, during page allocation we will detect whether the number of free pages of the zone is below high watermark. If so, we will stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. With this, we can reduce the possibility of the premature background page reclaiming caused by too large PCP. The high watermark checking is done in allocating path to reduce the overhead in hotter freeing path. 
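Conceptually, the detection added below in the zone iteration boils down to the following (a condensed sketch, not the literal hunk; the real code also skips the regular min-watermark check while the zone is known to be above high):

	if (!test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
	    !zone_watermark_fast(zone, order, high_wmark_pages(zone),
				 ac->highest_zoneidx, alloc_flags, gfp_mask))
		set_bit(ZONE_BELOW_HIGH, &zone->flags);

Once set, the flag steers nr_pcp_high() to shrink pcp->high on the freeing side, and it is cleared again in free_unref_page_commit() when the zone climbs back above the high watermark.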
Link: https://lkml.kernel.org/r/20231016053002.756205-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83af76a8cef9..58ab8389da05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2407,7 +2407,13 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(batch << 2, pcp->high); } - if (pcp->count >= high && high_min != high_max) { + if (high_min == high_max) + return high; + + if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { + pcp->high = max(high - (batch << pcp->free_factor), high_min); + high = max(pcp->count, high_min); + } else if (pcp->count >= high) { int need_high = (batch << pcp->free_factor) + batch; /* pcp->high should be large enough to hold batch freed pages */ @@ -2457,6 +2463,10 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) + clear_bit(ZONE_BELOW_HIGH, &zone->flags); } } @@ -2763,7 +2773,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) * If we had larger pcp->high, we could avoid to allocate from * zone. */ - if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) high = pcp->high = min(high + batch, high_max); if (!order) { @@ -3225,6 +3235,25 @@ retry: } } + /* + * Detect whether the number of free pages is below high + * watermark. If so, we will decrease pcp->high and free + * PCP pages in free path to reduce the possibility of + * premature page reclaiming. Detection is done here to + * avoid to do that in hotter free path. + */ + if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) + goto check_alloc_wmark; + + mark = high_wmark_pages(zone); + if (zone_watermark_fast(zone, order, mark, + ac->highest_zoneidx, alloc_flags, + gfp_mask)) + goto try_this_zone; + else + set_bit(ZONE_BELOW_HIGH, &zone->flags); + +check_alloc_wmark: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, ac->highest_zoneidx, alloc_flags, -- cgit From 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:02 +0800 Subject: mm, pcp: reduce detecting time of consecutive high order page freeing In the current PCP auto-tuning design, if the number of pages allocated is much larger than the number of pages freed on a CPU, the PCP high may become the maximal value even if the allocating/freeing depth is small, for example, in the sender of network workloads. If a CPU was originally used as a sender and is then used as a receiver after a context switch, the whole PCP has to be filled to the maximal high before PCP draining is triggered for consecutive high-order freeing. This will hurt the performance of some network workloads. To solve the issue, this patch tracks consecutive page freeing with a counter instead of relying on PCP draining.
So, we can detect consecutive page freeing much earlier. On a 2-socket Intel server with 128 logical CPUs, we tested the SCTP_STREAM_MANY test case of the netperf test suite with 64-pair processes. With the patch, the network bandwidth improves by 5.0%. This restores the performance drop caused by PCP auto-tuning. Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58ab8389da05..d52718284029 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2369,13 +2369,10 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free max_nr_free = high - batch; /* - * Double the number of pages freed each time there is subsequent - * freeing of pages without any allocation. + * Increase the batch number to the number of the consecutive + * freed pages to reduce zone lock contention. */ - batch <<= pcp->free_factor; - if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) - pcp->free_factor++; - batch = clamp(batch, min_nr_free, max_nr_free); + batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free); return batch; } @@ -2403,7 +2400,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, * stored on pcp lists */ if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { - pcp->high = max(high - (batch << pcp->free_factor), high_min); + int free_count = max_t(int, pcp->free_count, batch); + + pcp->high = max(high - free_count, high_min); return min(batch << 2, pcp->high); } @@ -2411,10 +2410,12 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { - pcp->high = max(high - (batch << pcp->free_factor), high_min); + int free_count = max_t(int, pcp->free_count, batch); + + pcp->high = max(high - free_count, high_min); high = max(pcp->count, high_min); } else if (pcp->count >= high) { - int need_high = (batch << pcp->free_factor) + batch; + int need_high = pcp->free_count + batch; /* pcp->high should be large enough to hold batch freed pages */ if (pcp->high < need_high) @@ -2451,7 +2452,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, * stops will be drained from vmstat refresh context. */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { - free_high = (pcp->free_factor && + free_high = (pcp->free_count >= batch && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || pcp->count >= READ_ONCE(batch))); @@ -2459,6 +2460,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; } + if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) + pcp->free_count += (1 << order); high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2855,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, * See nr_pcp_free() where free_factor is increased for subsequent * frees.
*/ - pcp->free_factor >>= 1; + pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); pcp_spin_unlock(pcp); @@ -5488,7 +5491,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta pcp->high_min = BOOT_PAGESET_HIGH; pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; - pcp->free_factor = 0; + pcp->free_count = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, -- cgit From 7d0715d0d6b28a831b6fdfefb29c5a7a4929fa49 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:41 -0700 Subject: mm: kmem: optimize get_obj_cgroup_from_current() Patch series "mm: improve performance of accounted kernel memory allocations", v5. This patchset improves the performance of accounted kernel memory allocations by ~30% as measured by a micro-benchmark [1]. The benchmark is very straightforward: 1M of 64 bytes-large kmalloc() allocations. Below are results with the disabled kernel memory accounting, the original state and with this patchset applied. | | Kmem disabled | Original | Patched | Delta | |-------------+---------------+----------+---------+--------| | User cgroup | 29764 | 84548 | 59078 | -30.0% | | Root cgroup | 29742 | 48342 | 31501 | -34.8% | As we can see, the patchset removes the majority of the overhead when there is no actual accounting (a task belongs to the root memory cgroup) and almost halves the accounting overhead otherwise. The main idea is to get rid of unnecessary memcg to objcg conversions and switch to a scope-based protection of objcgs, which eliminates extra operations with objcg reference counters under a rcu read lock. More details are provided in individual commit descriptions. This patch (of 5): Manually inline memcg_kmem_bypass() and active_memcg() to speed up get_obj_cgroup_from_current() by avoiding duplicate in_task() checks and active_memcg() readings. Also add a likely() macro to __get_obj_cgroup_from_memcg(): obj_cgroup_tryget() should succeed at almost all times except a very unlikely race with the memcg deletion path. Link: https://lkml.kernel.org/r/20231019225346.1822282-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20231019225346.1822282-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a86e7b445800..975ba766c16f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1086,19 +1086,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -static __always_inline bool memcg_kmem_bypass(void) -{ - /* Allow remote memcg charging from any context. */ - if (unlikely(active_memcg())) - return false; - - /* Memcg to charge can't be determined. */ - if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) - return true; - - return false; -} - /** * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. 
*/ @@ -3089,7 +3076,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { objcg = rcu_dereference(memcg->objcg); - if (objcg && obj_cgroup_tryget(objcg)) + if (likely(objcg && obj_cgroup_tryget(objcg))) break; objcg = NULL; } @@ -3098,16 +3085,23 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { - struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; - if (memcg_kmem_bypass()) - return NULL; + if (in_task()) { + memcg = current->active_memcg; + + /* Memcg to charge can't be determined. */ + if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD))) + return NULL; + } else { + memcg = this_cpu_read(int_active_memcg); + if (likely(!memcg)) + return NULL; + } rcu_read_lock(); - if (unlikely(active_memcg())) - memcg = active_memcg(); - else + if (!memcg) memcg = mem_cgroup_from_task(current); objcg = __get_obj_cgroup_from_memcg(memcg); rcu_read_unlock(); -- cgit From 1aacbd354313f25c855e662e41c04e2abf71444a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:42 -0700 Subject: mm: kmem: add direct objcg pointer to task_struct To charge a freshly allocated kernel object to a memory cgroup, the kernel needs to obtain an objcg pointer. Currently it does it indirectly by obtaining the memcg pointer first and then calling __get_obj_cgroup_from_memcg(). Usually tasks spend their entire life belonging to the same object cgroup. So it makes sense to save the objcg pointer on task_struct directly, so it can be obtained faster. It requires some work on fork, exit and cgroup migrate paths, but these paths are way colder. To avoid any costly synchronization the following rules are applied: 1) A task sets its objcg pointer itself. 2) If a task is being migrated to another cgroup, the least significant bit of the objcg pointer is set atomically. 3) On the allocation path the objcg pointer is obtained locklessly using the READ_ONCE() macro and the least significant bit is checked. If it's set, the following procedure is used to update it locklessly: - task->objcg is zeroed using cmpxchg - new objcg pointer is obtained - task->objcg is updated using try_cmpxchg - the operation is repeated if try_cmpxchg fails This guarantees that no updates will be lost if task migration races against an objcg pointer update. It also allows keeping both read and write paths fully lockless. Because the task is keeping a reference to the objcg, it can't go away while the task is alive. This commit doesn't change the way the remote memcg charging works.
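The fast path that results from these rules is short; a condensed sketch of the read side (the full version, including the cmpxchg-based update loop in current_objcg_update(), is in the diff below):

	objcg = READ_ONCE(current->objcg);
	if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG))
		/* a cgroup migration set the flag; rebuild locklessly */
		objcg = current_objcg_update();

In the common case the flag is clear and a single READ_ONCE() yields a usable objcg pointer.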
Link: https://lkml.kernel.org/r/20231019225346.1822282-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 975ba766c16f..96f4c319f025 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -249,6 +249,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) return container_of(vmpr, struct mem_cgroup, vmpressure); } +#define CURRENT_OBJCG_UPDATE_BIT 0 +#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT) + #ifdef CONFIG_MEMCG_KMEM static DEFINE_SPINLOCK(objcg_lock); @@ -3083,6 +3086,58 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) return objcg; } +static struct obj_cgroup *current_objcg_update(void) +{ + struct mem_cgroup *memcg; + struct obj_cgroup *old, *objcg = NULL; + + do { + /* Atomically drop the update bit. */ + old = xchg(¤t->objcg, NULL); + if (old) { + old = (struct obj_cgroup *) + ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); + if (old) + obj_cgroup_put(old); + + old = NULL; + } + + /* If new objcg is NULL, no reason for the second atomic update. */ + if (!current->mm || (current->flags & PF_KTHREAD)) + return NULL; + + /* + * Release the objcg pointer from the previous iteration, + * if try_cmpxcg() below fails. + */ + if (unlikely(objcg)) { + obj_cgroup_put(objcg); + objcg = NULL; + } + + /* + * Obtain the new objcg pointer. The current task can be + * asynchronously moved to another memcg and the previous + * memcg can be offlined. So let's get the memcg pointer + * and try get a reference to objcg under a rcu read lock. + */ + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + + /* + * Try set up a new objcg pointer atomically. If it + * fails, it means the update flag was set concurrently, so + * the whole procedure should be repeated. + */ + } while (!try_cmpxchg(¤t->objcg, &old, objcg)); + + return objcg; +} + __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct mem_cgroup *memcg; @@ -3090,19 +3145,26 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) if (in_task()) { memcg = current->active_memcg; + if (unlikely(memcg)) + goto from_memcg; - /* Memcg to charge can't be determined. 
*/ - if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD))) - return NULL; + objcg = READ_ONCE(current->objcg); + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) + objcg = current_objcg_update(); + + if (objcg) { + obj_cgroup_get(objcg); + return objcg; + } } else { memcg = this_cpu_read(int_active_memcg); - if (likely(!memcg)) - return NULL; + if (unlikely(memcg)) + goto from_memcg; } + return NULL; +from_memcg: rcu_read_lock(); - if (!memcg) - memcg = mem_cgroup_from_task(current); objcg = __get_obj_cgroup_from_memcg(memcg); rcu_read_unlock(); return objcg; @@ -6440,6 +6502,7 @@ static void mem_cgroup_move_task(void) mem_cgroup_clear_mc(); } } + #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { @@ -6453,8 +6516,39 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_MEMCG_KMEM +static void mem_cgroup_fork(struct task_struct *task) +{ + /* + * Set the update flag to cause task->objcg to be initialized lazily + * on the first allocation. It can be done without any synchronization + * because it's always performed on the current task, so does + * current_objcg_update(). + */ + task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG; +} + +static void mem_cgroup_exit(struct task_struct *task) +{ + struct obj_cgroup *objcg = task->objcg; + + objcg = (struct obj_cgroup *) + ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); + if (objcg) + obj_cgroup_put(objcg); + + /* + * Some kernel allocations can happen after this point, + * but let's ignore them. It can be done without any synchronization + * because it's always performed on the current task, so does + * current_objcg_update(). + */ + task->objcg = NULL; +} +#endif + #ifdef CONFIG_LRU_GEN -static void mem_cgroup_attach(struct cgroup_taskset *tset) +static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; @@ -6472,10 +6566,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset) task_unlock(task); } #else +static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_MEMCG_KMEM +static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + /* atomically set the update bit */ + set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); + } +} +#else +static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {} +#endif /* CONFIG_MEMCG_KMEM */ + +#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) static void mem_cgroup_attach(struct cgroup_taskset *tset) { + mem_cgroup_lru_gen_attach(tset); + mem_cgroup_kmem_attach(tset); } -#endif /* CONFIG_LRU_GEN */ +#endif static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { @@ -6885,9 +7000,15 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, +#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) .attach = mem_cgroup_attach, +#endif .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, +#ifdef CONFIG_MEMCG_KMEM + .fork = mem_cgroup_fork, + .exit = mem_cgroup_exit, +#endif .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, -- cgit From 675d6c9b59e313ca2573c93e8fd87011a99bb8ce Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 
15:53:43 -0700 Subject: mm: kmem: make memcg keep a reference to the original objcg Keep a reference to the original objcg object for the entire life of a memcg structure. This allows to simplify the synchronization on the kernel memory allocation paths: pinning a (live) memcg will also pin the corresponding objcg. The memory overhead of this change is minimal because object cgroups usually outlive their corresponding memory cgroups even without this change, so it's only an additional pointer per memcg. Link: https://lkml.kernel.org/r/20231019225346.1822282-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 96f4c319f025..ff036d5d339d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3899,6 +3899,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); + obj_cgroup_get(objcg); + memcg->orig_objcg = objcg; static_branch_enable(&memcg_kmem_online_key); @@ -5406,6 +5408,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + if (memcg->orig_objcg) + obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); kfree(memcg->vmstats); -- cgit From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:44 -0700 Subject: mm: kmem: scoped objcg protection Switch to a scope-based protection of the objcg pointer on slab/kmem allocation paths. Instead of using the get_() semantics in the pre-allocation hook and put the reference afterwards, let's rely on the fact that objcg is pinned by the scope. It's possible because: 1) if the objcg is received from the current task struct, the task is keeping a reference to the objcg. 2) if the objcg is received from an active memcg (remote charging), the memcg is pinned by the scope and has a reference to the corresponding objcg. Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- mm/slab.h | 15 ++++++++------- 2 files changed, 53 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff036d5d339d..a6457c8b5e16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3170,6 +3170,49 @@ from_memcg: return objcg; } +__always_inline struct obj_cgroup *current_obj_cgroup(void) +{ + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; + + if (in_task()) { + memcg = current->active_memcg; + if (unlikely(memcg)) + goto from_memcg; + + objcg = READ_ONCE(current->objcg); + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) + objcg = current_objcg_update(); + /* + * Objcg reference is kept by the task, so it's safe + * to use the objcg by the current task. 
+ */ + return objcg; + } + + memcg = this_cpu_read(int_active_memcg); + if (unlikely(memcg)) + goto from_memcg; + + return NULL; + +from_memcg: + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + /* + * Memcg pointer is protected by scope (see set_active_memcg()) + * and is pinning the corresponding objcg, so objcg can't go + * away and can be used within the scope without any additional + * protection. + */ + objcg = rcu_dereference_check(memcg->objcg, 1); + if (likely(objcg)) + break; + objcg = NULL; + } + + return objcg; +} + struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; @@ -3264,15 +3307,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) struct obj_cgroup *objcg; int ret = 0; - objcg = get_obj_cgroup_from_current(); + objcg = current_obj_cgroup(); if (objcg) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { + obj_cgroup_get(objcg); page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - obj_cgroup_put(objcg); } return ret; } diff --git a/mm/slab.h b/mm/slab.h index 799a315695c6..3d07fb428393 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) return true; - objcg = get_obj_cgroup_from_current(); + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); if (!objcg) return true; @@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, css_put(&memcg->css); if (ret) - goto out; + return false; } if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - goto out; + return false; *objcgp = objcg; return true; -out: - obj_cgroup_put(objcg); - return false; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, @@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_uncharge(objcg, obj_full_size(s)); } } - obj_cgroup_put(objcg); } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, -- cgit From c63b835d0eafc956c43b8c6605708240ac52b8cd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:45 -0700 Subject: percpu: scoped objcg protection Similar to slab and kmem, switch to a scope-based protection of the objcg pointer to avoid extra operations with the objcg reference counter.
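For reference, the scope that makes this safe is the usual set_active_memcg() pairing; a minimal, hypothetical caller might look like:

	struct mem_cgroup *old;

	old = set_active_memcg(memcg);
	/* memcg is pinned for the scope and (after the previous patch)
	 * pins its objcg too, so current_obj_cgroup() needs no extra
	 * reference here */
	ptr = kmalloc(size, GFP_KERNEL_ACCOUNT);
	set_active_memcg(old);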
Link: https://lkml.kernel.org/r/20231019225346.1822282-6-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/percpu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index a7665de8485f..f53ba692d67a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1628,14 +1628,12 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; - objcg = get_obj_cgroup_from_current(); + objcg = current_obj_cgroup(); if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { - obj_cgroup_put(objcg); + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; - } *objcgp = objcg; return true; @@ -1649,6 +1647,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, return; if (likely(chunk && chunk->obj_cgroups)) { + obj_cgroup_get(objcg); chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; rcu_read_lock(); @@ -1657,7 +1656,6 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); - obj_cgroup_put(objcg); } } -- cgit From e56808fef8f71a192b2740c0b6ea8be7ab865d54 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:46 -0700 Subject: mm: kmem: reimplement get_obj_cgroup_from_current() Reimplement get_obj_cgroup_from_current() using current_obj_cgroup(). get_obj_cgroup_from_current() and current_obj_cgroup() share 80% of the code, so the new implementation is almost trivial. get_obj_cgroup_from_current() is a convenient function used by the bpf subsystem, so there is no reason to get rid of it completely. 
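With current_obj_cgroup() doing the heavy lifting, the reimplementation plausibly reduces to a thin wrapper along these lines (a sketch; the hunk below only shows the removal of the old copy):

	__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
	{
		struct obj_cgroup *objcg = current_obj_cgroup();

		if (objcg)
			obj_cgroup_get(objcg);

		return objcg;
	}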
Link: https://lkml.kernel.org/r/20231019225346.1822282-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Reviewed-by: Vlastimil Babka Acked-by: Shakeel Butt Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Naresh Kamboju Signed-off-by: Andrew Morton --- mm/memcontrol.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6457c8b5e16..8b0859b8cc03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3138,38 +3138,6 @@ static struct obj_cgroup *current_objcg_update(void) return objcg; } -__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) -{ - struct mem_cgroup *memcg; - struct obj_cgroup *objcg; - - if (in_task()) { - memcg = current->active_memcg; - if (unlikely(memcg)) - goto from_memcg; - - objcg = READ_ONCE(current->objcg); - if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) - objcg = current_objcg_update(); - - if (objcg) { - obj_cgroup_get(objcg); - return objcg; - } - } else { - memcg = this_cpu_read(int_active_memcg); - if (unlikely(memcg)) - goto from_memcg; - } - return NULL; - -from_memcg: - rcu_read_lock(); - objcg = __get_obj_cgroup_from_memcg(memcg); - rcu_read_unlock(); - return objcg; -} - -- cgit From e5b306a082982cb98bc7ec48a382225522401a61 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 17 Oct 2023 09:17:28 +0800 Subject: mm/swap: avoid a xa load for swapout path The variable is never used on the swapout path (shadowp is NULL), and the compiler is unable to optimize out the unneeded load since xas_load() is a function call. This was introduced by 3852f6768ede ("mm/swapcache: support to handle the shadow entries"). Link: https://lkml.kernel.org/r/20231017011728.37508-1-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Matthew Wilcox (Oracle) Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/swap_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index b3b14bd0dd64..ab79ffb71736 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,9 +109,9 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - old = xas_load(&xas); - if (xa_is_value(old)) { - if (shadowp) + if (shadowp) { + old = xas_load(&xas); + if (xa_is_value(old)) *shadowp = old; } xas_store(&xas, folio); -- cgit From 67b33e3ff58374b3fca929933ccc04a1858fda6a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:50 +0800 Subject: mm: memory: use folio_last_cpupid() in do_numa_page() Convert to use folio_last_cpupid() in do_numa_page().
Link: https://lkml.kernel.org/r/20231018140806.2783514-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 1f88e4f6fbf2..0915eac772fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4858,7 +4858,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(&folio->page); + last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); -- cgit From c4a8d2faab1f9165df1543795254b1c2470ce7f8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:51 +0800 Subject: mm: huge_memory: use folio_last_cpupid() in do_huge_pmd_numa_page() Convert to use folio_last_cpupid() in do_huge_pmd_numa_page(). Link: https://lkml.kernel.org/r/20231018140806.2783514-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c9cbcbf6697e..f9571bf92603 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1562,7 +1562,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) * to record page access time. So use default value. */ if (node_is_toptier(nid)) - last_cpupid = page_cpupid_last(&folio->page); + last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); -- cgit From 19c1ac02ce02158fa22eb53f2750525ae93da9ef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:52 +0800 Subject: mm: huge_memory: use folio_last_cpupid() in __split_huge_page_tail() Convert to use folio_last_cpupid() in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231018140806.2783514-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9571bf92603..5455dfe4c3c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2514,7 +2514,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + page_cpupid_xchg_last(page_tail, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new -- cgit From ec1778807a8053d14cde7cfd75fbd66e0c7b9c9f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:56 +0800 Subject: mm: mprotect: use a folio in change_pte_range() Use a folio in change_pte_range() to save three compound_head() calls. Since now only normal and PMD-mapped page is handled by numa balancing, it is enough to only update the entire folio's access time. 
Link: https://lkml.kernel.org/r/20231018140806.2783514-10-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/mprotect.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index f1dc8f8c84ef..81991102f785 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static long change_pte_range(struct mmu_gather *tlb, * pages. See similar comment in change_huge_pmd. */ if (prot_numa) { - struct page *page; + struct folio *folio; int nid; bool toptier; @@ -122,13 +122,14 @@ static long change_pte_range(struct mmu_gather *tlb, if (pte_protnone(oldpte)) continue; - page = vm_normal_page(vma, addr, oldpte); - if (!page || is_zone_device_page(page) || PageKsm(page)) + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio || folio_is_zone_device(folio) || + folio_test_ksm(folio)) continue; /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - page_count(page) != 1) + folio_ref_count(folio) != 1) continue; /* @@ -136,14 +137,15 @@ static long change_pte_range(struct mmu_gather *tlb, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_lru(page) && PageDirty(page)) + if (folio_is_file_lru(folio) && + folio_test_dirty(folio)) continue; /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. */ - nid = page_to_nid(page); + nid = folio_nid(folio); if (target_node == nid) continue; toptier = node_is_toptier(nid); @@ -157,7 +159,7 @@ static long change_pte_range(struct mmu_gather *tlb, continue; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier) - xchg_page_access_time(page, + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } -- cgit From d986ba2b1953f761d3859c22160e82c58ed4287d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:57 +0800 Subject: mm: huge_memory: use a folio in change_huge_pmd() Use a folio in change_huge_pmd(), which helps to remove last xchg_page_access_time() caller. 
Link: https://lkml.kernel.org/r/20231018140806.2783514-11-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5455dfe4c3c7..f01f345141da 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1856,7 +1856,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); @@ -1865,7 +1865,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else entry = make_readable_migration_entry(swp_offset(entry)); @@ -1887,7 +1887,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #endif if (prot_numa) { - struct page *page; + struct folio *folio; bool toptier; /* * Avoid trapping faults against the zero page. The read-only @@ -1900,8 +1900,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - page = pmd_page(*pmd); - toptier = node_is_toptier(page_to_nid(page)); + folio = page_folio(pmd_page(*pmd)); + toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa * balancing is disabled @@ -1912,7 +1912,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier) - xchg_page_access_time(page, jiffies_to_msecs(jiffies)); + folio_xchg_access_time(folio, + jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical -- cgit From 4e694fe4d2fa3031392bdbeaa88066f67c886a0c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:01 +0800 Subject: mm: migrate: use folio_xchg_last_cpupid() in folio_migrate_flags() Convert to use folio_xchg_last_cpupid() in folio_migrate_flags(), also directly use folio_nid() instead of page_to_nid(&folio->page). Link: https://lkml.kernel.org/r/20231018140806.2783514-15-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index c602bf6dec97..71fc5064cec6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -588,20 +588,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. */ - cpupid = page_cpupid_xchg_last(&folio->page, -1); + cpupid = folio_xchg_last_cpupid(folio, -1); /* * For memory tiering mode, when migrate between slow and fast * memory node, reset cpupid, because that is used to record * page access time in slow memory node. 
*/ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { - bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); - bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); + bool f_toptier = node_is_toptier(folio_nid(folio)); + bool t_toptier = node_is_toptier(folio_nid(newfolio)); if (f_toptier != t_toptier) cpupid = -1; } - page_cpupid_xchg_last(&newfolio->page, cpupid); + folio_xchg_last_cpupid(newfolio, cpupid); folio_migrate_ksm(newfolio, folio); /* -- cgit From c82530113480f8db9dd9584c51ec9326e6ce9790 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:02 +0800 Subject: mm: huge_memory: use folio_xchg_last_cpupid() in __split_huge_page_tail() Convert to use folio_xchg_last_cpupid() in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231018140806.2783514-16-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f01f345141da..f31f02472396 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2515,7 +2515,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - page_cpupid_xchg_last(page_tail, folio_last_cpupid(folio)); + folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new -- cgit From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:03 +0800 Subject: mm: make finish_mkwrite_fault() static Make finish_mkwrite_fault static since it is not used outside of memory.c. Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0915eac772fe..2b6e2b239411 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3269,7 +3269,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, -- cgit From a86bc96b77df40c27ead5ef4ac3837904b7eb53f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:04 +0800 Subject: mm: convert wp_page_reuse() and finish_mkwrite_fault() to take a folio Saves one compound_head() call, also in preparation for page_cpupid_xchg_last() conversion. 
Link: https://lkml.kernel.org/r/20231018140806.2783514-18-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2b6e2b239411..94791c5e2951 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3015,7 +3015,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline void wp_page_reuse(struct vm_fault *vmf) +static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -3023,7 +3023,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); + VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); /* * Clear the pages cpupid information as the existing @@ -3258,6 +3258,7 @@ out: * writeable once the page is prepared * * @vmf: structure describing the fault + * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. @@ -3269,7 +3270,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ -static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -3285,7 +3286,7 @@ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } @@ -3309,9 +3310,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - return finish_mkwrite_fault(vmf); + return finish_mkwrite_fault(vmf, NULL); } - wp_page_reuse(vmf); + wp_page_reuse(vmf, NULL); return 0; } @@ -3339,14 +3340,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) folio_put(folio); return tmp; } - tmp = finish_mkwrite_fault(vmf); + tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); @@ -3491,7 +3492,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } /* -- cgit From c2c3b5148052cef670d359b81d338d20b96bf47f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:05 +0800 Subject: mm: use folio_xchg_last_cpupid() in wp_page_reuse() Convert to use folio_xchg_last_cpupid() in wp_page_reuse(), and remove page variable. Since now only normal and PMD-mapped page is handled by numa balancing, it's enough to only update the entire folio's last cpupid. 
Link: https://lkml.kernel.org/r/20231018140806.2783514-19-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 94791c5e2951..1f18ed4a5497 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3019,19 +3019,20 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (page) - page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + if (folio) { + VM_BUG_ON(folio_test_anon(folio) && + !PageAnonExclusive(vmf->page)); + /* + * Clear the folio's cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); + } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); -- cgit From 8f0f4788b1247c2f92ecacd8f86ce0b379b807b9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:06 +0800 Subject: mm: remove page_cpupid_xchg_last() Since all calls use folio_xchg_last_cpupid(), remove page_cpupid_xchg_last(). Link: https://lkml.kernel.org/r/20231018140806.2783514-20-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/mmzone.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmzone.c b/mm/mmzone.c index 68e1511be12d..b594d3f268fe 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -93,19 +93,19 @@ void lruvec_init(struct lruvec *lruvec) } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) -int page_cpupid_xchg_last(struct page *page, int cpupid) +int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(folio->flags); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); return last_cpupid; } -- cgit From 6d4e2cda62af38f7c21052b6847ffff08ea7173f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:46 +0800 Subject: bootmem: use kmemleak_free_part_phys in put_page_bootmem Patch series "Some bugfix about kmemleak", v3. Some bugfixes for kmemleak and the printed info from debug mode. This patch (of 7): Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to delete kmemleak object in put_page_bootmem(). 
In debug mode, there is the following warning: kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096) Link: https://lkml.kernel.org/r/20231018102952.3339837-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20231018102952.3339837-2-liushixin2@huawei.com Fixes: dd0ff4d12dd2 ("bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem") Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index b1efebfcf94b..fa7cb0c87c03 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -34,7 +34,7 @@ void put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - kmemleak_free_part(page_to_virt(page), PAGE_SIZE); + kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } } -- cgit From 62047e0f3e3a707599afe6d43cf684a2a02d05d5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:48 +0800 Subject: mm/kmemleak: fix print format of pointer in pr_debug() With 0x%p, the pointer will be hashed and printed as (____ptrval____) instead. And with 0x%pa, the pointer can be successfully printed but with duplicate prefixes, which looks like: kmemleak: kmemleak_free(0x(____ptrval____)) kmemleak: kmemleak_free_percpu(0x(____ptrval____)) kmemleak: kmemleak_free_part_phys(0x0x0000000a1af86000) Use 0x%px instead of 0x%p or 0x%pa to print the pointer. The output then looks like: kmemleak: kmemleak_free(0xffff9111c145b020) kmemleak: kmemleak_free_percpu(0x00000000000333b0) kmemleak: kmemleak_free_part_phys(0x0000000a1af80000) Link: https://lkml.kernel.org/r/20231018102952.3339837-4-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 54c2c90d3abc..289b3be5ee6e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -975,7 +975,7 @@ static void object_no_scan(unsigned long ptr) void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) { - pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); + pr_debug("%s(0x%px, %zu, %d)\n", __func__, ptr, size, min_count); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); @@ -996,7 +996,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, { unsigned int cpu; - pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size); + pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); /* * Percpu allocations are only scanned and not reported as leaks @@ -1020,7 +1020,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); */ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) { - pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + pr_debug("%s(0x%px, %zu)\n", __func__, area, size); /* * A min_count = 2 is needed because vm_struct contains a reference to @@ -1043,7 +1043,7 @@ EXPORT_SYMBOL_GPL(kmemleak_vmalloc); */ void __ref kmemleak_free(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); @@ -1061,7 +1061,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free); */ void __ref 
kmemleak_free_part(const void *ptr, size_t size) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size, false); @@ -1079,7 +1079,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) { unsigned int cpu; - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) @@ -1100,7 +1100,7 @@ void __ref kmemleak_update_trace(const void *ptr) { struct kmemleak_object *object; unsigned long flags; - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) return; @@ -1131,7 +1131,7 @@ EXPORT_SYMBOL(kmemleak_update_trace); */ void __ref kmemleak_not_leak(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); @@ -1149,7 +1149,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); */ void __ref kmemleak_ignore(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr, false); @@ -1169,7 +1169,7 @@ EXPORT_SYMBOL(kmemleak_ignore); */ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); @@ -1187,7 +1187,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); */ void __ref kmemleak_no_scan(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { - pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size); + pr_debug("%s(0x%px, %zu)\n", __func__, &phys, size); if (kmemleak_enabled) /* @@ -1223,7 +1223,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { - pr_debug("%s(0x%pa)\n", __func__, &phys); + pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) delete_object_part((unsigned long)phys, size, true); @@ -1237,7 +1237,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { - pr_debug("%s(0x%pa)\n", __func__, &phys); + pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) make_black_object((unsigned long)phys, true); -- cgit From 0edd7b58293340d26380d7510fd4fc08e5902511 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:49 +0800 Subject: mm: kmemleak: split __create_object into two functions __create_object() consists of two parts: the first part allocates a kmemleak object and initializes it, and the second part inserts it into the object tree. The function needs kmemleak_lock, but actually only the second part needs the lock. Split it into two functions: the first function, __alloc_object(), only allocates a kmemleak object, and the second function, __link_object(), initializes the object and inserts it into the object tree, so the kmemleak_lock only needs to protect __link_object().
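After the split, __create_object() keeps only the tree insertion under the lock, roughly (abridged from the diff below):

	object = __alloc_object(gfp);	/* allocation: no lock needed */
	if (!object)
		return;

	raw_spin_lock_irqsave(&kmemleak_lock, flags);
	__link_object(object, ptr, size, min_count, is_phys);
	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);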
[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20231018102952.3339837-5-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 61 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 289b3be5ee6e..39732dcaf45b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -623,25 +623,15 @@ static noinline depot_stack_handle_t set_track_prepare(void) return trace_handle; } -/* - * Create the metadata (struct kmemleak_object) corresponding to an allocated - * memory block and add it to the object_list and object_tree_root (or - * object_phys_tree_root). - */ -static void __create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp, bool is_phys) +static struct kmemleak_object *__alloc_object(gfp_t gfp) { - unsigned long flags; - struct kmemleak_object *object, *parent; - struct rb_node **link, *rb_parent; - unsigned long untagged_ptr; - unsigned long untagged_objp; + struct kmemleak_object *object; object = mem_pool_alloc(gfp); if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return; + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -649,13 +639,8 @@ static void __create_object(unsigned long ptr, size_t size, INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); - object->pointer = ptr; - object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; - object->min_count = min_count; object->count = 0; /* white color initially */ - object->jiffies = jiffies; object->checksum = 0; object->del_state = 0; @@ -680,7 +665,23 @@ static void __create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_handle = set_track_prepare(); - raw_spin_lock_irqsave(&kmemleak_lock, flags); + return object; +} + +static void __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) +{ + + struct kmemleak_object *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + + object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; + object->min_count = min_count; + object->jiffies = jiffies; untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* @@ -711,14 +712,32 @@ static void __create_object(unsigned long ptr, size_t size, */ dump_object_info(parent); kmem_cache_free(object_cache, object); - goto out; + return; } } rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); list_add_tail_rcu(&object->object_list, &object_list); -out: +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root (or + * object_phys_tree_root). 
+ */ +static void __create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, bool is_phys) +{ + struct kmemleak_object *object; + unsigned long flags; + + object = __alloc_object(gfp); + if (!object) + return; + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + __link_object(object, ptr, size, min_count, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } -- cgit From 2e1d47385f98aa0fa7d71aeee32d26cc6f8493e0 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:50 +0800 Subject: mm: kmemleak: use mem_pool_free() to free object The kmemleak object is allocated by mem_pool_alloc(), which could be from slab or mem_pool[], so it is not suitable to use kmem_cache_free() to free the object; use mem_pool_free() instead. Link: https://lkml.kernel.org/r/20231018102952.3339837-6-liushixin2@huawei.com Fixes: 0647398a8c7b ("mm: kmemleak: simple memory allocation pool for kmemleak objects") Signed-off-by: Liu Shixin Reviewed-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 39732dcaf45b..82322b029f1c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -668,8 +668,8 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) return object; } -static void __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) +static int __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) { struct kmemleak_object *parent; @@ -711,14 +711,15 @@ static void __link_object(struct kmemleak_object *object, unsigned long ptr, * be freed while the kmemleak_lock is held. */ dump_object_info(parent); - kmem_cache_free(object_cache, object); - return; + return -EEXIST; } } rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); list_add_tail_rcu(&object->object_list, &object_list); + + return 0; } /* @@ -731,14 +732,17 @@ static void __create_object(unsigned long ptr, size_t size, { struct kmemleak_object *object; unsigned long flags; + int ret; object = __alloc_object(gfp); if (!object) return; raw_spin_lock_irqsave(&kmemleak_lock, flags); - __link_object(object, ptr, size, min_count, is_phys); + ret = __link_object(object, ptr, size, min_count, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + if (ret) + mem_pool_free(object); } /* Create kmemleak object which allocated with virtual address. */ -- cgit From 858a195b9330d0bd36f59e8652f693a1beb7da2f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:51 +0800 Subject: mm/kmemleak: add __find_and_remove_object() Add a new __find_and_remove_object() that does not take kmemleak_lock itself, in preparation for the next patch.
Link: https://lkml.kernel.org/r/20231018102952.3339837-7-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 82322b029f1c..f63c220eb49e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -583,6 +583,19 @@ static void __remove_object(struct kmemleak_object *object) object->del_state |= DELSTATE_REMOVED; } +static struct kmemleak_object *__find_and_remove_object(unsigned long ptr, + int alias, + bool is_phys) +{ + struct kmemleak_object *object; + + object = __lookup_object(ptr, alias, is_phys); + if (object) + __remove_object(object); + + return object; +} + /* * Look up an object in the object search tree and remove it from both * object_tree_root (or object_phys_tree_root) and object_list. The @@ -596,9 +609,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali struct kmemleak_object *object; raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = __lookup_object(ptr, alias, is_phys); - if (object) - __remove_object(object); + object = __find_and_remove_object(ptr, alias, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; -- cgit From 5e4fc577db2514fee3cc2729415d0e2589d5d056 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:52 +0800 Subject: mm/kmemleak: fix partially freeing unknown object warning delete_object_part() can be called by multiple callers at the same time. If an object is found and removed by one caller, and another caller then tries to find it too, the lookup fails and the function returns directly. The object is still recorded by kmemleak even though it has already been freed to the buddy allocator. With DEBUG on, kmemleak will report the following warning, kmemleak: Partially freeing unknown object at 0xa1af86000 (size 4096) CPU: 0 PID: 742 Comm: test_huge Not tainted 6.6.0-rc3kmemleak+ #54 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x37/0x50 kmemleak_free_part_phys+0x50/0x60 hugetlb_vmemmap_optimize+0x172/0x290 ? __pfx_vmemmap_remap_pte+0x10/0x10 __prep_new_hugetlb_folio+0xe/0x30 prep_new_hugetlb_folio.isra.0+0xe/0x40 alloc_fresh_hugetlb_folio+0xc3/0xd0 alloc_surplus_hugetlb_folio.constprop.0+0x6e/0xd0 hugetlb_acct_memory.part.0+0xe6/0x2a0 hugetlb_reserve_pages+0x110/0x2c0 hugetlbfs_file_mmap+0x11d/0x1b0 mmap_region+0x248/0x9a0 ? hugetlb_get_unmapped_area+0x15c/0x2d0 do_mmap+0x38b/0x580 vm_mmap_pgoff+0xe6/0x190 ksys_mmap_pgoff+0x18a/0x1f0 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Expand __create_object() and move __alloc_object() to the beginning. Then use kmemleak_lock to protect __find_and_remove_object() and __link_object() as a whole, which guarantees that all objects are processed sequentially.
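The shape of the fix, condensed from the diff that follows: both possible remainder objects are preallocated before the critical section, so nothing is allocated while kmemleak_lock is held, and the remove/relink sequence becomes atomic with respect to concurrent callers:

	object_l = __alloc_object(GFP_KERNEL);
	object_r = __alloc_object(GFP_KERNEL);
	/* bail out if either allocation failed */

	raw_spin_lock_irqsave(&kmemleak_lock, flags);
	object = __find_and_remove_object(ptr, 1, is_phys);
	if (object) {
		/* relink the surviving left/right parts under the same lock hold */
	}
	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);

	/* any unused preallocated object goes back via mem_pool_free() */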
Link: https://lkml.kernel.org/r/20231018102952.3339837-8-liushixin2@huawei.com Fixes: 53238a60dd4a ("kmemleak: Allow partial freeing of memory blocks") Signed-off-by: Liu Shixin Reviewed-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f63c220eb49e..22bab3738a9e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -816,16 +816,25 @@ static void delete_object_full(unsigned long ptr) */ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) { - struct kmemleak_object *object; - unsigned long start, end; + struct kmemleak_object *object, *object_l, *object_r; + unsigned long start, end, flags; + + object_l = __alloc_object(GFP_KERNEL); + if (!object_l) + return; - object = find_and_remove_object(ptr, 1, is_phys); + object_r = __alloc_object(GFP_KERNEL); + if (!object_r) + goto out; + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = __find_and_remove_object(ptr, 1, is_phys); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", ptr, size); #endif - return; + goto unlock; } /* @@ -835,14 +844,25 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) start = object->pointer; end = object->pointer + object->size; - if (ptr > start) - __create_object(start, ptr - start, object->min_count, - GFP_KERNEL, is_phys); - if (ptr + size < end) - __create_object(ptr + size, end - ptr - size, object->min_count, - GFP_KERNEL, is_phys); + if ((ptr > start) && + !__link_object(object_l, start, ptr - start, + object->min_count, is_phys)) + object_l = NULL; + if ((ptr + size < end) && + !__link_object(object_r, ptr + size, end - ptr - size, + object->min_count, is_phys)) + object_r = NULL; + +unlock: + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + if (object) + __delete_object(object); - __delete_object(object); +out: + if (object_l) + mem_pool_free(object_l); + if (object_r) + mem_pool_free(object_r); } static void __paint_it(struct kmemleak_object *object, int color) -- cgit From 245245c2fffd0050772a3f30ba50e2be92537a32 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 23 Oct 2023 10:51:25 +0800 Subject: mm/kmemleak: move the initialisation of object to __link_object In the patch "mm: kmemleak: split __create_object into two functions", the initialisation of the object was split across two places. Catalin said this feels a bit weird and error prone. So leave __alloc_object() to do just the actual allocation, and let __link_object() do the full initialisation.
Link: https://lkml.kernel.org/r/20231023025125.90972-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 22bab3738a9e..1eacca03bedd 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -642,16 +642,32 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return NULL; } + return object; +} + +static int __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) +{ + + struct kmemleak_object *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + INIT_LIST_HEAD(&object->object_list); INIT_LIST_HEAD(&object->gray_list); INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; + object->min_count = min_count; object->count = 0; /* white color initially */ + object->jiffies = jiffies; object->checksum = 0; object->del_state = 0; @@ -676,24 +692,6 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) /* kernel backtrace */ object->trace_handle = set_track_prepare(); - return object; -} - -static int __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) -{ - - struct kmemleak_object *parent; - struct rb_node **link, *rb_parent; - unsigned long untagged_ptr; - unsigned long untagged_objp; - - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); - object->pointer = ptr; - object->size = kfence_ksize((void *)ptr) ?: size; - object->min_count = min_count; - object->jiffies = jiffies; - untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* * Only update min_addr and max_addr with object -- cgit From a259945efe6ada94087ef666e9b38f8e34ea34ba Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 17 Oct 2023 12:31:28 -0400 Subject: mm/migrate: correct nr_failed in migrate_pages_sync() nr_failed was missing the large folio splits from migrate_pages_batch() and could cause a mismatch between the migrate_pages() return value and the number of non-migrated pages, i.e., when the return value of migrate_pages() is 0, there are still pages left on the from page list. It happens when a non-PMD THP large folio fails to migrate due to -ENOMEM and is split successfully, but not all of the split pages are migrated: migrate_pages_batch() would return non-zero, but astats.nr_thp_split = 0. nr_failed would be 0 and returned to the caller of migrate_pages(), but the non-migrated pages are left on the from page list without being added back to the LRU lists. Fix it by adding a new nr_split counter for large folio splits and adding it to nr_failed in migrate_pages_sync() after migrate_pages_batch() is done.
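As a worked example (the numbers here are invented for illustration, not taken from the patch): suppose the async pass sees one 4-page, non-PMD-mappable large folio, fails it with -ENOMEM, splits it, and two of the four subpages still fail to migrate:

	/* hypothetical astats after the async migrate_pages_batch() call */
	astats.nr_thp_split = 0;	/* not PMD-mappable, so no THP split */
	astats.nr_split = 1;		/* but the large folio did split */

	/* before the fix this added astats.nr_thp_split, i.e. 0 */
	nr_failed += astats.nr_split;	/* now adds 1, so the sync retry runs */

Counting nr_split (which includes nr_thp_split) keeps the migrate_pages() return value consistent with the pages actually left on the from list.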
Link: https://lkml.kernel.org/r/20231017163129.2025214-1-zi.yan@sent.com Fixes: 2ef7dbb26990 ("migrate_pages: try migrate in batch asynchronously firstly") Signed-off-by: Zi Yan Acked-by: Huang Ying Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 71fc5064cec6..53244d283c3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1495,6 +1495,7 @@ struct migrate_pages_stats { int nr_thp_succeeded; /* THP migrated successfully */ int nr_thp_failed; /* THP failed to be migrated */ int nr_thp_split; /* THP split before migrating */ + int nr_split; /* Large folio (include THP) split before migrating */ }; /* @@ -1614,6 +1615,7 @@ static int migrate_pages_batch(struct list_head *from, int nr_retry_pages = 0; int pass = 0; bool is_thp = false; + bool is_large = false; struct folio *folio, *folio2, *dst = NULL, *dst2; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); @@ -1629,7 +1631,8 @@ static int migrate_pages_batch(struct list_head *from, nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { - is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); + is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); @@ -1649,6 +1652,7 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; + stats->nr_split++; continue; } stats->nr_failed_pages += nr_pages; @@ -1677,11 +1681,12 @@ static int migrate_pages_batch(struct list_head *from, nr_failed++; stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ - if (folio_test_large(folio) && !nosplit) { + if (is_large && !nosplit) { int ret = try_split_folio(folio, split_folios); if (!ret) { stats->nr_thp_split += is_thp; + stats->nr_split += is_large; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1827,6 +1832,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, stats->nr_succeeded += astats.nr_succeeded; stats->nr_thp_succeeded += astats.nr_thp_succeeded; stats->nr_thp_split += astats.nr_thp_split; + stats->nr_split += astats.nr_split; if (rc < 0) { stats->nr_failed_pages += astats.nr_failed_pages; stats->nr_thp_failed += astats.nr_thp_failed; @@ -1834,7 +1840,11 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, return rc; } stats->nr_thp_failed += astats.nr_thp_split; - nr_failed += astats.nr_thp_split; + /* + * Do not count rc, as pages will be retried below. + * Count nr_split only, since it includes nr_thp_split. + */ + nr_failed += astats.nr_split; /* * Fall back to migrate all failed folios one by one synchronously. All * failed folios except split THPs will be retried, so their failure -- cgit From 49cac03a8f0a56cafa5329911564c97c130ced43 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 17 Oct 2023 12:31:29 -0400 Subject: mm/migrate: add nr_split to trace_mm_migrate_pages stats. Add nr_split to trace_mm_migrate_pages for large folio (including THP) split events. 
[akpm@linux-foundation.org: cleanup per Huang, Ying] Link: https://lkml.kernel.org/r/20231017163129.2025214-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: "Huang, Ying" Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 53244d283c3e..125194f5af0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1686,7 +1686,7 @@ static int migrate_pages_batch(struct list_head *from, if (!ret) { stats->nr_thp_split += is_thp; - stats->nr_split += is_large; + stats->nr_split++; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1979,7 +1979,8 @@ out: count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, - stats.nr_thp_split, mode, reason); + stats.nr_thp_split, stats.nr_split, mode, + reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; -- cgit From c2baef394af88af19e3945f861145679e92ff3e4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 Oct 2023 18:43:54 +0800 Subject: mm: page_alloc: skip memoryless nodes entirely Patch series "handle memoryless nodes more appropriately", v3. Currently, during initialization or memory offlining, memoryless nodes are still built into the fallback lists of themselves and of other nodes. This is not what we expect, so this patch series removes memoryless nodes from the fallback lists entirely. This patch (of 2): In find_next_best_node(), we skipped the memoryless nodes when building the zonelists of other normal nodes (N_NORMAL), but did not skip the memoryless node itself when building its own zonelist. This causes it to be traversed at runtime. For example, say we have node0 and node1, where node0 is a memoryless node; the fallback order of node0 and node1 is then as follows: [ 0.153005] Fallback order for Node 0: 0 1 [ 0.153564] Fallback order for Node 1: 1 After this patch, we skip memoryless node0 entirely, and the fallback order of node0 and node1 becomes: [ 0.155236] Fallback order for Node 0: 1 [ 0.155806] Fallback order for Node 1: 1 So it becomes completely invisible, which reduces runtime overhead. And in this way, we will not try to allocate pages from memoryless node0, so the panic mentioned in [1] is also fixed. Even though this problem has been solved by dropping the NODE_MIN_SIZE constraint in x86 [2], it would be better to fix it in core MM as well. [1]. https://lore.kernel.org/all/20230212110305.93670-1-zhengqi.arch@bytedance.com/ [2].
https://lore.kernel.org/all/20231017062215.171670-1-rppt@kernel.org/ [zhengqi.arch@bytedance.com: update comment, per Ingo] Link: https://lkml.kernel.org/r/7300fc00a057eefeb9a68c8ad28171c3f0ce66ce.1697799303.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cover.1697799303.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cover.1697711415.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/157013e978468241de4a4c05d5337a44638ecb0e.1697711415.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Acked-by: Ingo Molnar Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d52718284029..aa90758950a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5020,8 +5020,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask) int min_val = INT_MAX; int best_node = NUMA_NO_NODE; - /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { + /* + * Use the local node if we haven't already, but for memoryless local + * node, we should skip it and fall back to other nodes. + */ + if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) { node_set(node, *used_node_mask); return node; } -- cgit From b7812c86c7403283f8b3775ee11c6c01d457016c Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 Oct 2023 18:43:55 +0800 Subject: mm: memory_hotplug: drop memoryless node from fallback lists In offline_pages(), if a node becomes memoryless, we will clear its N_MEMORY state by calling node_states_clear_node(). But we do this after rebuilding the zonelists by calling build_all_zonelists(), which will cause this memoryless node to still be in the fallback nodes (node_order[]) of other nodes. To drop memoryless nodes from fallback nodes in this case, just call node_states_clear_node() before calling build_all_zonelists(). In this way, we will not try to allocate pages from memoryless node0, then the panic mentioned in [1] will also be fixed. Even though this problem has been solved by dropping the NODE_MIN_SIZE constrain in x86 [2], it would be better to fix it in the core MM as well. https://lore.kernel.org/all/20230212110305.93670-1-zhengqi.arch@bytedance.com/ [1] https://lore.kernel.org/all/20231017062215.171670-1-rppt@kernel.org/ [2] Link: https://lkml.kernel.org/r/9f1dbe7ee1301c7163b2770e32954ff5e3ecf2c4.1697711415.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Acked-by: Ingo Molnar Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3b301c4023ff..ab41a511e20a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2012,12 +2012,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); + /* + * Make sure to mark the node as memory-less before rebuilding the zone + * list. Otherwise this node would still appear in the fallback lists. 
+ */ + node_states_clear_node(node, &arg); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } - node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { kcompactd_stop(node); kswapd_stop(node); -- cgit From 8dd1e896735f6e5abf66525dfd39bbd7b8c0c6d6 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:27 -0700 Subject: mm/khugepaged: convert __collapse_huge_page_isolate() to use folios Patch series "Some khugepaged folio conversions", v3. This patchset converts a number of functions to use folios. This cleans up some khugepaged code and removes a large number of hidden compound_head() calls. This patch (of 5): Replaces 11 calls to compound_head() with 1, and removes 1348 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20231020183331.10770-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88433cc25d8a..500756604488 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -542,6 +542,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, struct list_head *compound_pagelist) { struct page *page = NULL; + struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; @@ -576,7 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE(!PageAnon(page), page); + folio = page_folio(page); + VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); if (page_mapcount(page) > 1) { ++shared; @@ -588,16 +590,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } } - if (PageCompound(page)) { - struct page *p; - page = compound_head(page); + if (folio_test_large(folio)) { + struct folio *f; /* * Check if we have dealt with the compound page * already */ - list_for_each_entry(p, compound_pagelist, lru) { - if (page == p) + list_for_each_entry(f, compound_pagelist, lru) { + if (folio == f) goto next; } } @@ -608,7 +609,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } @@ -624,8 +625,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(page)) { - unlock_page(page); + if (!is_refcount_suitable(&folio->page)) { + folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } @@ -634,27 +635,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. 
*/ - if (!isolate_lru_page(page)) { - unlock_page(page); + if (!folio_isolate_lru(folio)) { + folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - compound_nr(page)); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - if (PageCompound(page)) - list_add_tail(&page->lru, compound_pagelist); + if (folio_test_large(folio)) + list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && - (pte_young(pteval) || page_is_young(page) || - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; @@ -668,13 +669,13 @@ next: result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, + trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(page, none_or_zero, + trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } -- cgit From 5c07ebb372d66423e508ecfb8e00324f8797f072 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:28 -0700 Subject: mm/khugepaged: convert hpage_collapse_scan_pmd() to use folios Replaces 5 calls to compound_head(), and removes 1385 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 500756604488..6c4b5af43371 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1248,6 +1248,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; + struct folio *folio = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; @@ -1334,29 +1335,28 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, } } - page = compound_head(page); - + folio = page_folio(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. 
*/ - node = page_to_nid(page); + node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; - if (!PageLRU(page)) { + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } - if (PageLocked(page)) { + if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } - if (!PageAnon(page)) { + if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } @@ -1371,7 +1371,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(page)) { + if (!is_refcount_suitable(&folio->page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1381,8 +1381,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && - (pte_young(pteval) || page_is_young(page) || - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; } @@ -1404,7 +1404,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, none_or_zero, result, unmapped); return result; } -- cgit From dbf85c21e4aff90912b5d7755d2b25611f9191e9 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:29 -0700 Subject: mm/khugepaged: convert is_refcount_suitable() to use folios Both callers of is_refcount_suitable() have been converted to use folios, so convert it to take in a folio. Both callers only operate on head pages of folios so mapcount/refcount conversions here are trivial. Removes 3 calls to compound head, and removes 315 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6c4b5af43371..9efd8ff68f06 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -524,15 +524,15 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct page *page) +static bool is_refcount_suitable(struct folio *folio) { int expected_refcount; - expected_refcount = total_mapcount(page); - if (PageSwapCache(page)) - expected_refcount += compound_nr(page); + expected_refcount = folio_mapcount(folio); + if (folio_test_swapcache(folio)) + expected_refcount += folio_nr_pages(folio); - return page_count(page) == expected_refcount; + return folio_ref_count(folio) == expected_refcount; } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, @@ -625,7 +625,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(&folio->page)) { + if (!is_refcount_suitable(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -1371,7 +1371,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. 
*/ - if (!is_refcount_suitable(&folio->page)) { + if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } -- cgit From b455f39d228935f88eebcd1f7c1a6981093c6a3b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:30 -0700 Subject: mm/khugepaged: convert alloc_charge_hpage() to use folios Also remove count_memcg_page_event now that its last caller is gone, and rename hpage_collapse_alloc_page() to hpage_collapse_alloc_folio(). This removes 1 call to compound_head() and helps convert khugepaged to use folios throughout. Link: https://lkml.kernel.org/r/20231020183331.10770-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9efd8ff68f06..6a7184cd291b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -888,16 +888,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) } #endif -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, +static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node, nodemask_t *nmask) { - *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); - if (unlikely(!*hpage)) { + *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask); + + if (unlikely(!*folio)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return false; } - folio_prep_large_rmappable((struct folio *)*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return true; } @@ -1064,17 +1064,20 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, int node = hpage_collapse_find_target_node(cc); struct folio *folio; - if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) + if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) { + *hpage = NULL; return SCAN_ALLOC_HUGE_PAGE_FAIL; + } - folio = page_folio(*hpage); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *hpage = NULL; return SCAN_CGROUP_CHARGE_FAIL; } - count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); + + *hpage = folio_page(folio, 0); return SCAN_SUCCEED; } -- cgit From 98b32d296d95d7aa0516c36b72406277412268cd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:31 -0700 Subject: mm/khugepaged: convert collapse_pte_mapped_thp() to use folios This removes 2 calls to compound_head() and helps convert khugepaged to use folios throughout. Previously, if the address passed to collapse_pte_mapped_thp() corresponded to a tail page, the scan would fail immediately. Using filemap_lock_folio() we get the corresponding folio back and try to operate on the folio instead.
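Condensed from the diff that follows, the new lookup takes the folio directly, so an address that maps to a tail page no longer fails the scan:

	folio = filemap_lock_folio(vma->vm_file->f_mapping,
				   linear_page_index(vma, haddr));
	if (IS_ERR(folio))
		return SCAN_PAGE_NULL;

	if (folio_order(folio) != HPAGE_PMD_ORDER) {
		result = SCAN_PAGE_COMPOUND;
		goto drop_folio;
	}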
Link: https://lkml.kernel.org/r/20231020183331.10770-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6a7184cd291b..bc2d8ff269c7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1477,7 +1477,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); - struct page *hpage; + struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; @@ -1510,19 +1510,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; - hpage = find_lock_page(vma->vm_file->f_mapping, + folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); - if (!hpage) + if (IS_ERR(folio)) return SCAN_PAGE_NULL; - if (!PageHead(hpage)) { - result = SCAN_FAIL; - goto drop_hpage; - } - - if (compound_order(hpage) != HPAGE_PMD_ORDER) { + if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; - goto drop_hpage; + goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); @@ -1536,13 +1531,13 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, */ goto maybe_install_pmd; default: - goto drop_hpage; + goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ - goto drop_hpage; + goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1567,7 +1562,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ - if (hpage + i != page) + if (folio_page(folio, i) != page) goto abort; } @@ -1582,7 +1577,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. - * So page lock of hpage does not protect from it, so we must not drop + * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) @@ -1606,7 +1601,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: - * page lock stops more PTEs of the hpage being faulted in, but + * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. 
*/ @@ -1615,7 +1610,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); - if (hpage + i != page) + if (folio_page(folio, i) != page) goto abort; /* @@ -1634,8 +1629,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { - page_ref_sub(hpage, nr_ptes); - add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + folio_ref_sub(folio, nr_ptes); + add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } /* step 4: remove empty page table */ @@ -1659,14 +1654,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, hpage) + ? set_huge_pmd(vma, haddr, pmd, &folio->page) : SCAN_SUCCEED; - goto drop_hpage; + goto drop_folio; abort: if (nr_ptes) { flush_tlb_mm(mm); - page_ref_sub(hpage, nr_ptes); - add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + folio_ref_sub(folio, nr_ptes); + add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); @@ -1674,9 +1669,9 @@ abort: spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); -drop_hpage: - unlock_page(hpage); - put_page(hpage); +drop_folio: + folio_unlock(folio); + folio_put(folio); return result; } -- cgit From be16dd764a69752a31096d1a6b2ad775b728b1bd Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Mon, 23 Oct 2023 17:44:05 +0500 Subject: mm: fix multiple typos in multiple files Link: https://lkml.kernel.org/r/20231023124405.36981-1-m.muzzammilashraf@gmail.com Signed-off-by: Muhammad Muzammil Reviewed-by: Randy Dunlap Cc: "James E.J. Bottomley" Cc: Matthew Wilcox (Oracle) Cc: Muhammad Muzammil Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 4 ++-- mm/internal.h | 2 +- mm/memcontrol.c | 4 ++-- mm/mmap.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 48e329ea5ba3..e651500e597a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1322,8 +1322,8 @@ static int __init debug_vm_pgtable(void) * true irrespective of the starting protection value for a * given page table entry. * - * Protection based vm_flags combinatins are always linear - * and increasing i.e starting from VM_NONE and going upto + * Protection based vm_flags combinations are always linear + * and increasing i.e starting from VM_NONE and going up to * (VM_SHARED | READ | WRITE | EXEC). */ #define VM_FLAGS_START (VM_NONE) diff --git a/mm/internal.h b/mm/internal.h index c61a98d3b3c7..3eceae1ec4c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -592,7 +592,7 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is - * within the range [start, end). Funcation caller nees to do page table + * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. 
* * Typical usage (like mlock or madvise) is: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b0859b8cc03..774bd6e21e27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -819,7 +819,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* - * The caller from rmap relay on disabled preemption becase they never + * The caller from rmap relies on disabled preemption because they never * update their counter from in-interrupt context. For these two * counters we check that the update is never performed from an * interrupt context while other caller need to have disabled interrupt. @@ -8044,7 +8044,7 @@ static struct cftype memsw_files[] = { * * This doesn't check for specific headroom, and it is not atomic * either. But with zswap, the size of the allocation is only known - * once compression has occured, and this optimistic pre-check avoids + * once compression has occurred, and this optimistic pre-check avoids * spending cycles on compression when there is already no room left * or zswap is disabled altogether somewhere in the hierarchy. */ diff --git a/mm/mmap.c b/mm/mmap.c index 8b57e42fd980..984804d77ae1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1223,7 +1223,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec - * mounted, in which case we dont add PROT_EXEC.) + * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) -- cgit From 76f26535d1446373d4735a252ea4247c39d64ba6 Mon Sep 17 00:00:00 2001 From: Hyesoo Yu Date: Mon, 23 Oct 2023 17:32:16 +0900 Subject: mm: page_alloc: check the order of compound page even when the order is zero For compound pages, the head sets the PG_head flag and the tail sets the compound_head to indicate the head page. If a user allocates a compound page and frees it with a different order, the compound page information will not be properly initialized. To detect this problem, compound_order(page) and the order argument are compared, but this is not checked when the order argument is zero. That error should be checked regardless of the order. Link: https://lkml.kernel.org/r/20231023083217.1866451-1-hyesoo.yu@samsung.com Signed-off-by: Hyesoo Yu Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa90758950a7..5f46c7f85cb1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1079,6 +1079,7 @@ static __always_inline bool free_pages_prepare(struct page *page, int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); + bool compound = PageCompound(page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1097,16 +1098,15 @@ static __always_inline bool free_pages_prepare(struct page *page, return false; } + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. 
*/ if (unlikely(order)) { - bool compound = PageCompound(page); int i; - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) page[1].flags &= ~PAGE_FLAGS_SECOND; for (i = 1; i < (1 << order); i++) { -- cgit From e5b16c862884a62c09ab60cbfc6c7b544670bf9e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 22 Oct 2023 20:56:19 +0200 Subject: mm: hugetlb_vmemmap: fix reference to nonexistent file The directory this file is in was renamed but the reference didn't get updated. Fix it. Link: https://lkml.kernel.org/r/20231022185619.919397-1-vegard.nossum@oracle.com Fixes: ee65728e103b ("docs: rename Documentation/vm to Documentation/mm") Signed-off-by: Vegard Nossum Acked-by: Mike Rapoport (IBM) Reviewed-by: Muchun Song Reviewed-by: Rik van Riel Acked-by: Mike Kravetz Cc: Matthew Wilcox Cc: Ira Weiny Cc: Jonathan Corbet Cc: Wu XiangCheng Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9fd0cb270502..2fcae92d3359 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -12,7 +12,7 @@ /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See - * Documentation/vm/vmemmap_dedup.rst. + * Documentation/mm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) -- cgit From eebb3dabbb5cc590afe32880b5d3726d0fbf88db Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Oct 2023 12:33:22 +0800 Subject: mm: migrate: record the mlocked page status to remove unnecessary lru drain When doing compaction, I found that lru_add_drain() is an obvious hotspot when migrating pages. The distribution of this hotspot is as follows: - 18.75% compact_zone - 17.39% migrate_pages - 13.79% migrate_pages_batch - 11.66% migrate_folio_move - 7.02% lru_add_drain + 7.02% lru_add_drain_cpu + 3.00% move_to_new_folio 1.23% rmap_walk + 1.92% migrate_folio_unmap + 3.20% migrate_pages_sync + 0.90% isolate_migratepages The lru_add_drain() was added by commit c3096e6782b7 ("mm/migrate: __unmap_and_move() push good newpage to LRU") to drain the newpage to the LRU immediately, to help build up the correct newpage->mlock_count in remove_migration_ptes() for mlocked pages. However, if no mlocked pages are being migrated, we can avoid this lru drain operation, especially in heavily concurrent scenarios. So we can record the source pages' mlocked status in migrate_folio_unmap(), and only drain the lru list in migrate_folio_move() when the mlocked status is set. In addition, the page is already isolated from the LRU when migrating, so the mlocked status checked via folio_test_mlocked() in migrate_folio_unmap() is stable.
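Condensed from the diff further below, the two bits of source-page state are packed into one word that travels with the destination folio between the unmap and move phases:

	enum {
		PAGE_WAS_MAPPED = BIT(0),
		PAGE_WAS_MLOCKED = BIT(1),
	};

	/* migrate_folio_unmap(), after locking the source folio */
	if (folio_test_mlocked(src))
		old_page_state |= PAGE_WAS_MLOCKED;

	/* migrate_folio_move(): drain only when it actually matters */
	if (old_page_state & PAGE_WAS_MLOCKED)
		lru_add_drain();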
After this patch, I can see that the hotspot of lru_add_drain() is gone: - 9.41% migrate_pages_batch - 6.15% migrate_folio_move - 3.64% move_to_new_folio + 1.80% migrate_folio_extra + 1.70% buffer_migrate_folio + 1.41% rmap_walk + 0.62% folio_add_lru + 3.07% migrate_folio_unmap Meanwhile, the compaction latency shows some improvements when running thpscale: base patched Amean fault-both-1 1131.22 ( 0.00%) 1112.55 * 1.65%* Amean fault-both-3 2489.75 ( 0.00%) 2324.15 * 6.65%* Amean fault-both-5 3257.37 ( 0.00%) 3183.18 * 2.28%* Amean fault-both-7 4257.99 ( 0.00%) 4079.04 * 4.20%* Amean fault-both-12 6614.02 ( 0.00%) 6075.60 * 8.14%* Amean fault-both-18 10607.78 ( 0.00%) 8978.86 * 15.36%* Amean fault-both-24 14911.65 ( 0.00%) 11619.55 * 22.08%* Amean fault-both-30 14954.67 ( 0.00%) 14925.66 * 0.19%* Amean fault-both-32 16654.87 ( 0.00%) 15580.31 * 6.45%* Link: https://lkml.kernel.org/r/06e9153a7a4850352ec36602df3a3a844de45698.1697859741.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/migrate.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 125194f5af0f..35a88334bb3c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1027,22 +1027,28 @@ union migration_ptr { struct anon_vma *anon_vma; struct address_space *mapping; }; + +enum { + PAGE_WAS_MAPPED = BIT(0), + PAGE_WAS_MLOCKED = BIT(1), +}; + static void __migrate_folio_record(struct folio *dst, - unsigned long page_was_mapped, + unsigned long old_page_state, struct anon_vma *anon_vma) { union migration_ptr ptr = { .anon_vma = anon_vma }; dst->mapping = ptr.mapping; - dst->private = (void *)page_was_mapped; + dst->private = (void *)old_page_state; } static void __migrate_folio_extract(struct folio *dst, - int *page_was_mappedp, + int *old_page_state, struct anon_vma **anon_vmap) { union migration_ptr ptr = { .mapping = dst->mapping }; *anon_vmap = ptr.anon_vma; - *page_was_mappedp = (unsigned long)dst->private; + *old_page_state = (unsigned long)dst->private; dst->mapping = NULL; dst->private = NULL; } @@ -1103,7 +1109,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, { struct folio *dst; int rc = -EAGAIN; - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__folio_test_movable(src); bool locked = false; @@ -1157,6 +1163,8 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, folio_lock(src); } locked = true; + if (folio_test_mlocked(src)) + old_page_state |= PAGE_WAS_MLOCKED; if (folio_test_writeback(src)) { /* @@ -1206,7 +1214,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, dst_locked = true; if (unlikely(!is_lru)) { - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1232,11 +1240,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ?
TTU_BATCH_FLUSH : 0); - page_was_mapped = 1; + old_page_state |= PAGE_WAS_MAPPED; } if (!folio_mapped(src)) { - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1248,7 +1256,8 @@ out: if (rc == -EAGAIN) ret = NULL; - migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); + migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; @@ -1261,12 +1270,12 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct list_head *ret) { int rc; - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__folio_test_movable(src); struct list_head *prev; - __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); @@ -1287,10 +1296,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, * isolated from the unevictable LRU: but this case is the easiest. */ folio_add_lru(dst); - if (page_was_mapped) + if (old_page_state & PAGE_WAS_MLOCKED) lru_add_drain(); - if (page_was_mapped) + if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, false); out_unlock_both: @@ -1322,11 +1331,12 @@ out: */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return rc; } - migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); + migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; @@ -1799,12 +1809,12 @@ out: dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; - __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); - migrate_folio_undo_src(folio, page_was_mapped, anon_vma, - true, ret_folios); + __migrate_folio_extract(dst, &old_page_state, &anon_vma); + migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; -- cgit From b1454b463c217e5bc553acc44b2389d9257c9708 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Oct 2023 23:38:41 -0700 Subject: mm: mlock: avoid folio_within_range() on KSM pages Since commit dc68badcede4 ("mm: mlock: update mlock_pte_range to handle large folio") I've just occasionally seen VM_WARN_ON_FOLIO(folio_test_ksm) warnings from folio_within_range(), in a splurge after testing with KSM hyperactive. folio_referenced_one()'s use of folio_within_vma() is safe because it checks folio_test_large() first; but allow_mlock_munlock() needs to do the same to avoid those warnings (or check !folio_test_ksm() itself? Or move either check into folio_within_range()? Hard to tell without more examples of its use). 
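The fix, condensed from the diff that follows, mirrors what folio_referenced_one() already does: test folio_test_large() first, so small folios (which is all a KSM page can be) return early and never reach folio_within_range():

	/* folio_within_range() cannot take KSM, but any small folio is OK */
	if (!folio_test_large(folio))
		return true;

	/* folio not in range [start, end), skip mlock */
	if (!folio_within_range(folio, vma, start, end))
		return false;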
Link: https://lkml.kernel.org/r/23852f6a-5bfa-1ffd-30db-30c5560ad426@google.com Fixes: dc68badcede4 ("mm: mlock: update mlock_pte_range to handle large folio") Signed-off-by: Hugh Dickins Reviewed-by: Yin Fengwei Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Stefan Roesch Signed-off-by: Andrew Morton --- mm/mlock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index aa44456200e3..086546ac5766 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -346,6 +346,10 @@ static inline bool allow_mlock_munlock(struct folio *folio, if (!(vma->vm_flags & VM_LOCKED)) return true; + /* folio_within_range() cannot take KSM, but any small folio is OK */ + if (!folio_test_large(folio)) + return true; + /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; -- cgit From d35963bfb05877455228ecec6b194f624489f96a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:21 +0000 Subject: mm/damon/core: avoid divide-by-zero during monitoring results update When monitoring attributes are changed, DAMON updates access rate of the monitoring results accordingly. For that, it divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Link: https://lkml.kernel.org/r/20231019194924.100347-3-sj@kernel.org Fixes: 2f5bef5a590b ("mm/damon/core: update monitoring results for new monitoring attributes") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [6.3+] Signed-off-by: Andrew Morton --- mm/damon/core.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 9f4f7c378cf3..e194c8075235 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -500,20 +500,14 @@ static unsigned int damon_age_for_new_attrs(unsigned int age, static unsigned int damon_accesses_bp_to_nr_accesses( unsigned int accesses_bp, struct damon_attrs *attrs) { - unsigned int max_nr_accesses = - attrs->aggr_interval / attrs->sample_interval; - - return accesses_bp * max_nr_accesses / 10000; + return accesses_bp * damon_max_nr_accesses(attrs) / 10000; } /* convert nr_accesses to access ratio in bp (per 10,000) */ static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { - unsigned int max_nr_accesses = - attrs->aggr_interval / attrs->sample_interval; - - return nr_accesses * 10000 / max_nr_accesses; + return nr_accesses * 10000 / damon_max_nr_accesses(attrs); } static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, -- cgit From 3bafc47d3c4a2fc4d3b382aeb3c087f8fc84d9fd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:22 +0000 Subject: mm/damon/ops-common: avoid divide-by-zero during region hotness calculation When calculating the hotness of each region for the under-quota regions prioritization, DAMON divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. 
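All of these fixes funnel through damon_max_nr_accesses(). Its definition is not part of the hunks shown in this series; presumably (an assumption on my part, since the helper lives outside mm/damon/ in include/linux/damon.h) it keeps the division in unsigned long and clamps the result, along these lines:

	/* sketch only -- the actual definition is not shown in this series */
	static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs)
	{
		/* {aggr,sample}_interval are unsigned long, hence could overflow */
		return min(attrs->aggr_interval / attrs->sample_interval,
			   (unsigned long)UINT_MAX);
	}

so that the unsigned int result can no longer truncate to zero for huge interval ratios.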
Link: https://lkml.kernel.org/r/20231019194924.100347-4-sj@kernel.org Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [5.16+] Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index ac1c3fa80f98..d25d99cb5f2b 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -73,7 +73,6 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { - unsigned int max_nr_accesses; int freq_subscore; unsigned int age_in_sec; int age_in_log, age_subscore; @@ -81,8 +80,8 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / + damon_max_nr_accesses(&c->attrs); age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; -- cgit From 44063f125af4bb4efd1d500d8091fa33a98af325 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:23 +0000 Subject: mm/damon/lru_sort: avoid divide-by-zero in hot threshold calculation When calculating the hotness threshold for the lru_prio scheme of DAMON_LRU_SORT, the module divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Link: https://lkml.kernel.org/r/20231019194924.100347-5-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 3ecdcc029443..f2e5f9431892 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -195,9 +195,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = damon_lru_sort_mon_attrs.aggr_interval / - damon_lru_sort_mon_attrs.sample_interval * + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) -- cgit From 62f76a7b53bfa2ecfe1570a5b1d0d574c576a56d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:24 +0000 Subject: mm/damon/core: avoid divide-by-zero from pseudo-moving window length calculation When calculating the pseudo-moving access rate, DAMON divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Note that this is a fix for a commit that is not in mainline but in the mm tree.
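To make the truncation concrete, a worked example with hypothetical interval values (microseconds, chosen purely for illustration):

	/* Hypothetical attrs; only the two interval fields matter here. */
	struct damon_attrs attrs = {
		.sample_interval = 1,
		.aggr_interval = 1UL << 32,	/* quotient: 4294967296 */
	};
	unsigned int len_window = attrs.aggr_interval / attrs.sample_interval;
	/*
	 * 4294967296 truncates to 0 in an unsigned int, so the subsequent
	 * damon_moving_sum(..., len_window, ...) would divide by zero;
	 * damon_max_nr_accesses(&attrs) clamps the result to UINT_MAX
	 * instead.
	 */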
Link: https://lkml.kernel.org/r/20231019194924.100347-6-sj@kernel.org Fixes: ace30fb21af5 ("mm/damon/core: use pseudo-moving sum for nr_accesses_bp") Reported-by: Jakub Acs Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index e194c8075235..aa2dc7087cd9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1665,7 +1665,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, * aggr_interval, owing to validation of damon_set_attrs(). */ if (attrs->sample_interval) - len_window = attrs->aggr_interval / attrs->sample_interval; + len_window = damon_max_nr_accesses(attrs); r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp, r->last_nr_accesses * 10000, len_window, accessed ? 10000 : 0); -- cgit From b8ee5575f763c239902f8523d82103a45c153b29 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:34 +0000 Subject: mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets() damon_sysfs_set_targets() had a bug that could result in unexpected memory usage and increased monitoring overhead. The bug has been fixed by a previous commit. Add a unit test to avoid similar bugs in the future. Link: https://lkml.kernel.org/r/20231022210735.46409-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 12 +++++++ mm/damon/sysfs-test.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 2 ++ 3 files changed, 100 insertions(+) create mode 100644 mm/damon/sysfs-test.h (limited to 'mm') diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 436c6b4cb5ec..29f43fbc2eff 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -59,6 +59,18 @@ config DAMON_SYSFS This builds the sysfs interface for DAMON. The user space can use the interface for arbitrary data access monitoring. +config DAMON_SYSFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_SYSFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON sysfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N.
+ config DAMON_DBGFS bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS diff --git a/mm/damon/sysfs-test.h b/mm/damon/sysfs-test.h new file mode 100644 index 000000000000..73bdce2452c1 --- /dev/null +++ b/mm/damon/sysfs-test.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_SYSFS_KUNIT_TEST + +#ifndef _DAMON_SYSFS_TEST_H +#define _DAMON_SYSFS_TEST_H + +#include <kunit/test.h> + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static int __damon_sysfs_test_get_any_pid(int min, int max) +{ + struct pid *pid; + int i; + + for (i = min; i <= max; i++) { + pid = find_get_pid(i); + if (pid) { + put_pid(pid); + return i; + } + } + return -1; +} + +static void damon_sysfs_test_set_targets(struct kunit *test) +{ + struct damon_sysfs_targets *sysfs_targets; + struct damon_sysfs_target *sysfs_target; + struct damon_ctx *ctx; + + sysfs_targets = damon_sysfs_targets_alloc(); + sysfs_targets->nr = 1; + sysfs_targets->targets_arr = kmalloc_array(1, + sizeof(*sysfs_targets->targets_arr), GFP_KERNEL); + + sysfs_target = damon_sysfs_target_alloc(); + sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100); + sysfs_target->regions = damon_sysfs_regions_alloc(); + sysfs_targets->targets_arr[0] = sysfs_target; + + ctx = damon_new_ctx(); + + damon_sysfs_set_targets(ctx, sysfs_targets); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); + + sysfs_target->pid = __damon_sysfs_test_get_any_pid( + sysfs_target->pid + 1, 200); + damon_sysfs_set_targets(ctx, sysfs_targets); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); + + damon_destroy_ctx(ctx); + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target); +} + +static struct kunit_case damon_sysfs_test_cases[] = { + KUNIT_CASE(damon_sysfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_sysfs_test_suite = { + .name = "damon-sysfs", + .test_cases = damon_sysfs_test_cases, +}; +kunit_test_suite(damon_sysfs_test_suite); + +#endif /* _DAMON_SYSFS_TEST_H */ + +#endif /* CONFIG_DAMON_SYSFS_KUNIT_TEST */ diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f73dc88d2d19..5846970cedce 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1836,3 +1836,5 @@ out: return err; } subsys_initcall(damon_sysfs_init); + +#include "sysfs-test.h" -- cgit From 1cb5d11a370f661c5d0d888bb0cfc2cdc5791382 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:17:43 -0700 Subject: mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified.
queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 338 +++++++++++++++++++++++++++------------------------------ 1 file changed, 159 insertions(+), 179 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8bb5b4b4b395..b6d82c8bc479 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -111,7 +111,8 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ -#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -416,9 +417,19 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static bool strictly_unmovable(unsigned long flags) +{ + /* + * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO + * if any misplaced page is found. + */ + return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == + MPOL_MF_STRICT; +} + struct queue_pages { struct list_head *pagelist; unsigned long flags; @@ -426,7 +437,8 @@ struct queue_pages { unsigned long start; unsigned long end; struct vm_area_struct *first; - bool has_unmovable; + struct folio *large; /* note last large folio encountered */ + long nr_failed; /* could not be isolated at this time */ }; /* @@ -444,61 +456,37 @@ static inline bool queue_folio_required(struct folio *folio, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } -/* - * queue_folios_pmd() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. 
zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). - * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing folio was already on a node that does not follow the - * policy. - */ -static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, - unsigned long end, struct mm_walk *walk) - __releases(ptl) +static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { - int ret = 0; struct folio *folio; struct queue_pages *qp = walk->private; - unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = -EIO; - goto unlock; + qp->nr_failed++; + return; } folio = pfn_folio(pmd_pfn(*pmd)); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; - goto unlock; + return; } if (!queue_folio_required(folio, qp)) - goto unlock; - - flags = qp->flags; - /* go to folio migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - if (!vma_migratable(walk->vma) || - migrate_folio_add(folio, qp->pagelist, flags)) { - qp->has_unmovable = true; - goto unlock; - } - } else - ret = -EIO; -unlock: - spin_unlock(ptl); - return ret; + return; + if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma) || + !migrate_folio_add(folio, qp->pagelist, qp->flags)) + qp->nr_failed++; } /* - * Scan through pages checking if pages follow certain conditions, - * and move them to the pagelist if they do. + * Scan through folios, checking if they satisfy the required conditions, + * moving them from LRU to local pagelist for migration if they do (or not). * - * queue_folios_pte_range() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). - * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already - * on a node that does not follow the policy. + * queue_folios_pte_range() has two possible return values: + * 0 - continue walking to scan for more, even if an existing folio on the + * wrong node could not be isolated and queued for migration. + * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, + * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -512,8 +500,11 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) - return queue_folios_pmd(pmd, ptl, addr, end, walk); + if (ptl) { + queue_folios_pmd(pmd, walk); + spin_unlock(ptl); + goto out; + } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { @@ -522,8 +513,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); - if (!pte_present(ptent)) + if (pte_none(ptent)) continue; + if (!pte_present(ptent)) { + if (is_migration_entry(pte_to_swp_entry(ptent))) + qp->nr_failed++; + continue; + } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; @@ -535,95 +531,87 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_folio_required(folio, qp)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* - * MPOL_MF_STRICT must be specified if we get here. 
- * Continue walking vmas due to MPOL_MF_MOVE* flags. - */ - if (!vma_migratable(vma)) - qp->has_unmovable = true; - + if (folio_test_large(folio)) { /* - * Do not abort immediately since there may be - * temporary off LRU pages in the range. Still - * need migrate other LRU pages. + * A large folio can only be isolated from LRU once, + * but may be mapped by many PTEs (and Copy-On-Write may + * intersperse PTEs of other, order 0, folios). This is + * a common case, so don't mistake it for failure (but + * there can be other cases of multi-mapped pages which + * this quick check does not help to filter out - and a + * search of the pagelist might grow to be prohibitive). + * + * migrate_pages(&pagelist) returns nr_failed folios, so + * check "large" now so that queue_pages_range() returns + * a comparable nr_failed folios. This does imply that + * if folio could not be isolated for some racy reason + * at its first PTE, later PTEs will not give it another + * chance of isolation; but keeps the accounting simple. */ - if (migrate_folio_add(folio, qp->pagelist, flags)) - qp->has_unmovable = true; - } else - break; + if (folio == qp->large) + continue; + qp->large = folio; + } + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(vma) || + !migrate_folio_add(folio, qp->pagelist, flags)) { + qp->nr_failed++; + if (strictly_unmovable(flags)) + break; + } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); - - return addr != end ? -EIO : 0; +out: + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; + return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; - unsigned long flags = (qp->flags & MPOL_MF_VALID); + unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) + qp->nr_failed++; goto unlock; + } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; - - if (flags == MPOL_MF_STRICT) { - /* - * STRICT alone means only detecting misplaced folio and no - * need to further check other vma. - */ - ret = -EIO; + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma)) { + qp->nr_failed++; goto unlock; } - - if (!vma_migratable(walk->vma)) { - /* - * Must be STRICT with MOVE*, otherwise .test_walk() have - * stopped walking current vma. - * Detecting misplaced folio but allow migrating folios which - * have been queued. - */ - qp->has_unmovable = true; - goto unlock; - } - /* - * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it - * is shared it is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * expensive, so check the estimated sharers of the folio instead. 
*/ - if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && - !hugetlb_pmd_shared(pte))) { - if (!isolate_hugetlb(folio, qp->pagelist) && - (flags & MPOL_MF_STRICT)) - /* - * Failed to isolate folio but allow migrating pages - * which have been queued. - */ - qp->has_unmovable = true; - } + if ((flags & MPOL_MF_MOVE_ALL) || + (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + if (!isolate_hugetlb(folio, qp->pagelist)) + qp->nr_failed++; unlock: spin_unlock(ptl); -#else - BUG(); + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; #endif - return ret; + return 0; } #ifdef CONFIG_NUMA_BALANCING @@ -704,8 +692,11 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, return 1; } - /* queue pages from current vma */ - if (flags & MPOL_MF_VALID) + /* + * Check page nodes, and queue pages to move, in the current vma. + * But if no moving, and no strict checking, the scan can be skipped. + */ + if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } @@ -727,22 +718,21 @@ static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { /* * Walk through page tables and collect pages to be migrated. * - * If pages found in a given range are on a set of nodes (determined by - * @nodes and @flags,) it's isolated and queued to the pagelist which is - * passed via @private. + * If pages found in a given range are not on the required set of @nodes, + * and migration is allowed, they are isolated and queued to @pagelist. * - * queue_pages_range() has three possible return values: - * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * 0 - queue pages successfully or no misplaced page. - * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or - * memory range specified by nodemask and maxnode points outside - * your accessible address space (-EFAULT) + * queue_pages_range() may return: + * 0 - all pages already on the right node, or successfully queued for moving + * (or neither strict checking nor moving requested: only range checking). + * >0 - this number of misplaced folios could not be queued for moving + * (a hugetlbfs page or a transparent huge page being counted as 1). + * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. + * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ -static int +static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist, bool lock_vma) + struct list_head *pagelist) { int err; struct queue_pages qp = { @@ -752,20 +742,17 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .start = start, .end = end, .first = NULL, - .has_unmovable = false, }; - const struct mm_walk_ops *ops = lock_vma ? + const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); - if (qp.has_unmovable) - err = 1; if (!qp.first) /* whole range in hole */ err = -EFAULT; - return err; + return err ? 
: qp.nr_failed; } /* @@ -1008,16 +995,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* - * We try to migrate only unshared folios. If it is shared it - * is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { if (folio_isolate_lru(folio)) { @@ -1025,32 +1012,31 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); - } else if (flags & MPOL_MF_STRICT) { + } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be - * isolated, so they can't be moved at the moment. It - * should return -EIO for this case too. + * isolated, so they can't be moved at the moment. */ - return -EIO; + return false; } } - - return 0; + return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -static int migrate_to_node(struct mm_struct *mm, int source, int dest, - int flags) +static long migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); - int err = 0; + long nr_failed; + long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, @@ -1059,23 +1045,27 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + vma = find_vma(mm, 0); + /* - * This does not "check" the range but isolates all pages that + * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address - * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, + * but passes back the count of pages which could not be isolated. 
*/ - vma = find_vma(mm, 0); - VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } + if (err >= 0) + err += nr_failed; return err; } @@ -1088,8 +1078,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { - int busy = 0; - int err = 0; + long nr_failed = 0; + long err = 0; nodemask_t tmp; lru_cache_disable(); @@ -1171,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) - busy += err; + nr_failed += err; if (err < 0) break; } @@ -1180,8 +1170,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_enable(); if (err < 0) return err; - return busy; - + return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* @@ -1220,10 +1209,10 @@ static struct folio *new_folio(struct folio *src, unsigned long start) } #else -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - return -EIO; + return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, @@ -1247,8 +1236,8 @@ static long do_mbind(unsigned long start, unsigned long len, struct vma_iterator vmi; struct mempolicy *new; unsigned long end; - int err; - int ret; + long err; + long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) @@ -1288,10 +1277,8 @@ static long do_mbind(unsigned long start, unsigned long len, start, start + len, mode, mode_flags, nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); - } { NODEMASK_SCRATCH(scratch); if (scratch) { @@ -1307,44 +1294,37 @@ static long do_mbind(unsigned long start, unsigned long len, goto mpol_out; /* - * Lock the VMAs before scanning for pages to migrate, to ensure we don't - * miss a concurrently inserted page. + * Lock the VMAs before scanning for pages to migrate, + * to ensure we don't miss a concurrently inserted page. 
*/ - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist, true); + nr_failed = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); - if (ret < 0) { - err = ret; - goto up_out; - } - - vma_iter_init(&vmi, mm, start); - prev = vma_prev(&vmi); - for_each_vma_range(vmi, vma, end) { - err = mbind_range(&vmi, vma, &prev, start, end, new); - if (err) - break; + if (nr_failed < 0) { + err = nr_failed; + } else { + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } } if (!err) { - int nr_failed = 0; - if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_folio, NULL, + nr_failed |= migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); - if (nr_failed) - putback_movable_pages(&pagelist); } - - if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; - } else { -up_out: - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); } + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); + mmap_write_unlock(mm); mpol_out: mpol_put(new); -- cgit From 7f1ee4e2070883d18a431c761db8bb30a958b654 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:19:00 -0700 Subject: mempolicy trivia: delete those ancient pr_debug()s Delete those ancient pr_debug()s - PDprintk()s in Andi Kleen's original submission of core NUMA API, and useful when debugging shared mempolicy lifetime back then, but not used recently. Link: https://lkml.kernel.org/r/f25135-ffb2-40d8-9577-720772b333@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b6d82c8bc479..02a9b89c10aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -264,9 +264,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; - pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); - if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -768,11 +765,6 @@ static int vma_replace_policy(struct vm_area_struct *vma, vma_assert_write_locked(vma); - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); @@ -1273,10 +1265,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", - start, start + len, mode, mode_flags, - nmask ? 
nodes_addr(*nmask)[0] : NUMA_NO_NODE); - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { @@ -2496,8 +2484,6 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ @@ -2636,7 +2622,6 @@ void mpol_put_task_policy(struct task_struct *task) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2793,12 +2778,6 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); - if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); if (!new) -- cgit From c36f6e6dff4d32ec8b6da8f553933727a57a7a4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:20:14 -0700 Subject: mempolicy trivia: slightly more consistent naming Before getting down to work, do a little cleanup, mainly of inconsistent variable naming. I gave up trying to rationalize mpol versus pol versus policy, and node versus nid, but let's avoid p and nd. Remove a few superfluous blank lines, but add one; and here prefer vma->vm_policy to vma_policy(vma) - the latter being appropriate in other sources, which have to allow for !CONFIG_NUMA. That intriguing line about KERNEL_DS? should have gone in v2.6.15, when numa_policy_init() stopped using set_mempolicy(2)'s system call handler. Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 73 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02a9b89c10aa..50a57f4c8013 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -25,7 +25,7 @@ * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * - * preferred Try a specific node first before normal fallback. + * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default @@ -52,7 +52,7 @@ * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * - * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. 
*/ @@ -291,6 +291,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); @@ -303,11 +304,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } /* Slow path of a mpol destructor. */ -void __mpol_put(struct mempolicy *p) +void __mpol_put(struct mempolicy *pol) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, p); + kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) @@ -364,7 +365,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * * Called with task's alloc_lock held. */ - void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); @@ -375,7 +375,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ - void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; @@ -757,7 +756,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, - struct mempolicy *pol) + struct mempolicy *pol) { int err; struct mempolicy *old; @@ -800,7 +799,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, vmstart = vma->vm_start; } - if (mpol_equal(vma_policy(vma), new_pol)) { + if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } @@ -855,18 +854,18 @@ out: * * Called with task's alloc_lock held */ -static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (pol == &default_policy) return; - switch (p->mode) { + switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: - *nodes = p->nodes; + *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ @@ -1634,7 +1633,6 @@ out: out_put: put_task_struct(task); goto out; - } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, @@ -1644,7 +1642,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } - /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -1827,10 +1824,10 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) * policy_node() is always coupled with policy_nodemask(), which * secures the nodemask limit for 'bind' and 'prefer-many' policy. 
*/ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) { if (policy->mode == MPOL_PREFERRED) { - nd = first_node(policy->nodes); + nid = first_node(policy->nodes); } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy @@ -1845,19 +1842,18 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) policy->home_node != NUMA_NO_NODE) return policy->home_node; - return nd; + return nid; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned next; - struct task_struct *me = current; + unsigned int nid; - next = next_node_in(me->il_prev, policy->nodes); - if (next < MAX_NUMNODES) - me->il_prev = next; - return next; + nid = next_node_in(current->il_prev, policy->nodes); + if (nid < MAX_NUMNODES) + current->il_prev = nid; + return nid; } /* @@ -2347,7 +2343,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { - struct mempolicy *pol = mpol_dup(vma_policy(src)); + struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); @@ -2771,40 +2767,40 @@ put_mpol: } } -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - write_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - write_unlock(&p->lock); + write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -2854,7 +2850,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -2917,7 +2912,6 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2928,7 +2922,6 @@ static const char * const policy_modes[] = [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. -- cgit From 93397c3b7684555b7cec726cd13eef6742d191fe Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:21:34 -0700 Subject: mempolicy trivia: use pgoff_t in shared mempolicy tree Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a shared mempolicy tree. Delete confusing comment about pseudo mm vmas. 
Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 50a57f4c8013..c18f0a1b97e3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2428,8 +2428,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2483,8 +2483,8 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; @@ -2652,8 +2652,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. */ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; -- cgit From 35ec8fa0207b3c7f7c3c22337c9a507d7b291626 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:22:59 -0700 Subject: mempolicy: mpol_shared_policy_init() without pseudo-vma mpol_shared_policy_init() does not need to use a pseudo-vma: it can use sp_alloc() and sp_insert() directly, since the object's shared policy tree is empty and inaccessible (needing no lock) at get_inode() time. 
Link: https://lkml.kernel.org/r/3bef62d8-ae78-4c2-533-56a44ae425c@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c18f0a1b97e3..b6461d7d200e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2736,30 +2736,30 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - vma_init(&pvma, NULL); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: -- cgit From 2cafb582173f3870240af90de3f31d18b0728882 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:24:18 -0700 Subject: mempolicy: remove confusing MPOL_MF_LAZY dead code v3.8 commit b24f53a0bea3 ("mm: mempolicy: Add MPOL_MF_LAZY") introduced MPOL_MF_LAZY, and included it in the MPOL_MF_VALID flags; but a720094ded8 ("mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now") immediately removed it from MPOL_MF_VALID flags, pending further review. "This will need to be revisited", but it has not been reinstated. The present state is confusing: there is dead code in mm/mempolicy.c to handle MPOL_MF_LAZY cases which can never occur. Remove that: it can be resurrected later if necessary. But keep the definition of MPOL_MF_LAZY, which must remain in the UAPI, even though it always fails with EINVAL. https://lore.kernel.org/linux-mm/1553041659-46787-1-git-send-email-yang.shi@linux.alibaba.com/ links to a previous request to remove MPOL_MF_LAZY. 
Link: https://lkml.kernel.org/r/80c9665c-1c3f-17ba-21a3-f6115cebf7d@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yang Shi Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b6461d7d200e..e01d3f807f77 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,12 +636,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, @@ -680,14 +674,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (endvma > end) endvma = end; - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && - !(vma->vm_flags & VM_MIXEDMAP)) - change_prot_numa(vma, start, endvma); - return 1; - } - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. @@ -1254,9 +1240,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1301,7 +1284,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!err) { if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed |= migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- cgit From 23e4883248f0472d806c8b3422ba6257e67bf1a5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:25:33 -0700 Subject: mm: add page_rmappable_folio() wrapper folio_prep_large_rmappable() is being used repeatedly along with a conversion from page to folio, a check non-NULL, a check order > 1: wrap it all up into struct folio *page_rmappable_folio(struct page *). 
Link: https://lkml.kernel.org/r/8d92c6cf-eebe-748-e29c-c8ab224c741@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mempolicy.c | 17 +++-------------- mm/page_alloc.c | 8 ++------ 3 files changed, 14 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 3eceae1ec4c0..b61034bd50f5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -415,6 +415,15 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) void folio_undo_large_rmappable(struct folio *folio); +static inline struct folio *page_rmappable_folio(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (folio && folio_order(folio) > 1) + folio_prep_large_rmappable(folio); + return folio; +} + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e01d3f807f77..596d580f92d1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2122,10 +2122,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (pol->mode == MPOL_PREFERRED_MANY) { @@ -2135,10 +2132,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { @@ -2232,12 +2226,7 @@ EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f46c7f85cb1..733732e7e0ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4598,12 +4598,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page = __alloc_pages(gfp | __GFP_COMP, order, - preferred_nid, nodemask); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + preferred_nid, nodemask); + return page_rmappable_folio(page); } EXPORT_SYMBOL(__folio_alloc); -- cgit From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 Oct 2023 13:39:08 -0700 Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. 
alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/mempolicy.c | 379 +++++++++++++++++++++++--------------------------------- mm/shmem.c | 92 ++++++++------ mm/swap.h | 9 +- mm/swap_state.c | 86 ++++++++----- mm/zswap.c | 7 +- 5 files changed, 270 insertions(+), 303 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 596d580f92d1..ded5508f0972 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -898,6 +898,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. 
We @@ -909,10 +910,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -1170,6 +1168,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start) break; } + /* + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL + * when the page can no longer be located in a vma: that is not ideal + * (migrate_pages() will give up early, presuming ENOMEM), but good + * enough to avoid a crash by syzkaller or concurrent holepunch. + */ + if (!vma) + return NULL; + if (folio_test_hugetlb(src)) { return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); @@ -1178,9 +1185,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start) if (folio_test_large(src)) gfp = GFP_TRANSHUGE; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); } @@ -1690,34 +1694,19 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1726,14 +1715,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. 
*/ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1743,8 +1736,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -1779,54 +1773,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) -{ - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; - - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} - -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. - */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) -{ - if (policy->mode == MPOL_PREFERRED) { - nid = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. - */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); - } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - return nid; -} - /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1886,11 +1832,11 @@ unsigned int mempolicy_slab_node(void) } /* - * Do static interleaving for a VMA with known offset @n. Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. 
*/ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; @@ -1908,33 +1854,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -1950,27 +1917,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. 
- * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2048,27 +2004,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2081,7 +2018,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); @@ -2089,55 +2026,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. - * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. 
*/ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - return page_rmappable_folio(page); - } - - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask_t *nodemask; + struct page *page; - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - return page_rmappable_folio(page); - } + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2148,39 +2059,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_pages_node(nid, + gfp | __GFP_THISNODE | __GFP_NORETRY, order); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, - nmask); + } + } - goto out; + page = __alloc_pages(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); } } - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + return page; +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @hugepage: Unused (was: For hugepages try only preferred node if possible). + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. 
The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of alloc_pages_mpol() is more appropriate. + * + * Return: The folio on success or NULL if allocation fails. + */ +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) +{ + struct mempolicy *pol; + pgoff_t ilx; + struct page *page; + + pol = get_vma_policy(vma, addr, order, &ilx); + page = alloc_pages_mpol(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); -out: - return folio; + return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); @@ -2198,33 +2138,23 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; - struct page *page; - - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else if (pol->mode == MPOL_PREFERRED_MANY) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else - page = __alloc_pages(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); - return page; + return alloc_pages_mpol(gfp, order, + pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } @@ -2295,6 +2225,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); @@ -2307,9 +2239,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return __alloc_pages_bulk(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2496,23 +2429,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); - unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); break; case 
MPOL_PREFERRED: diff --git a/mm/shmem.c b/mm/shmem.c index bcbe9dbb9f5f..a314a25aea8c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif - -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; + struct mempolicy *mpol; + pgoff_t ilx; struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; - shmem_pseudo_vma_init(&pvma, info, index); - page = swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); if (!page) return NULL; @@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); + page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return page_rmappable_folio(page); } static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return (struct folio *)page; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ - folio = shmem_swapin(swap, gfp, info, index); + folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; @@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by 
inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { diff --git a/mm/swap.h b/mm/swap.h index 8a3c7a0ace4f..73c332ee4d91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,8 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +struct mempolicy; + #ifdef CONFIG_SWAP #include /* for bio_end_io_t */ @@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, unsigned long addr, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void) } static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) + gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index ab79ffb71736..85d9e5806a6a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si; struct folio *folio; @@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. 
*/ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { - bool page_was_allocated; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - if (page_was_allocated) - swap_readpage(retpage, false, plug); + mpol = get_vma_policy(vma, addr, 0, &ilx); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); - return retpage; + if (page_allocated) + swap_readpage(page, false, plug); + return page; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct page for entry and addr, after queueing swapin. * @@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. 
*/ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!page) continue; if (page_allocated) { @@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. @@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. 
* */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!page) continue; if (page_allocated) { @@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } /** @@ -853,9 +868,16 @@ skip: struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + page = swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + return page; } #ifdef CONFIG_SYSFS diff --git a/mm/zswap.c b/mm/zswap.c index 37d2b1cb2ecb..060857adca76 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, { swp_entry_t swpentry = entry->swpentry; struct page *page; + struct mempolicy *mpol; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); @@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache page */ - page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, - &page_was_allocated); + mpol = get_task_policy(current); + page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &page_was_allocated); if (!page) { ret = -ENOMEM; goto fail; -- cgit From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:27:47 -0700 Subject: mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. 
They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/hugetlb.c | 38 ++++++++++++++------------- mm/mempolicy.c | 83 ++++++++++++++++++++++++++++++---------------------------- 2 files changed, 63 insertions(+), 58 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd8065e36038..1169ef2f2176 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -6559,6 +6541,26 @@ out_mutex: } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). 
+ */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ded5508f0972..0594362f2470 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { @@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); vma = find_vma(mm, 0); /* @@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) @@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. + * Allocate a new folio for page migration, according to NUMA mempolicy. */ -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - - for_each_vma(vmi, vma) { - address = page_address_in_vma(&src->page, vma); - if (address != -EFAULT) - break; - } + struct mempolicy *pol = (struct mempolicy *)private; + pgoff_t ilx = 0; /* improve on this later */ + struct page *page; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - /* - * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL - * when the page can no longer be located in a vma: that is not ideal - * (migrate_pages() will give up early, presuming ENOMEM), but good - * enough to avoid a crash by syzkaller or concurrent holepunch. 
- */ - if (!vma) - return NULL; + order = folio_order(src); + ilx += src->index >> order; if (folio_test_hugetlb(src)) { - return alloc_hugetlb_folio_vma(folio_hstate(src), - vma, address); + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - return vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); + return page_rmappable_folio(page); } #else @@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed < 0) { err = nr_failed; + nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); @@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len, } } - if (!err) { - if (!list_empty(&pagelist)) { - nr_failed |= migrate_pages(&pagelist, new_folio, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } - if (nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)new, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); } + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - - mmap_write_unlock(mm); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) -- cgit From 88c91dc58582f1d0c6f5f501051b0262d06ec4ed Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:29:00 -0700 Subject: mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. 
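For intuition, the arithmetic this commit sets up can be sketched in plain C as below. This is a simplified userspace model, not the kernel code: nodes[] and the sample numbers are hypothetical stand-ins for pol->nodes and real page indices, and interleave_node() plays the role of interleave_nid() from the series above. do_mbind() picks mmpol.ilx so that base + (page->index >> order) reproduces the first page's interleave index; every later page of the same vma then maps back to its originally intended node:

	#include <stdio.h>

	/* cf. interleave_nid(): pick the ilx'th node, wrapping around */
	static int interleave_node(const int *nodes, int nnodes, unsigned long ilx)
	{
		return nodes[ilx % nnodes];
	}

	int main(void)
	{
		const int nodes[] = { 0, 2, 3 };      /* hypothetical interleave nodemask */
		const int nnodes = 3;
		const unsigned long first_index = 40; /* page->index of first page found */
		const unsigned long first_ilx = 17;   /* ilx from get_vma_policy() */
		/* base chosen so base + index == first_ilx for the first page (order 0) */
		const unsigned long base = first_ilx - first_index;

		for (unsigned long index = first_index; index < first_index + 4; index++)
			printf("page index %lu -> node %d\n", index,
			       interleave_node(nodes, nnodes, base + index));
		return 0;
	}

The unsigned wraparound in computing base is harmless here, mirroring the kernel's mmpol.ilx -= page->index >> order.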
[hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0594362f2470..5e472e6e0507 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -428,6 +428,11 @@ static bool strictly_unmovable(unsigned long flags) MPOL_MF_STRICT; } +struct migration_mpol { /* for alloc_migration_target_by_mpol() */ + struct mempolicy *pol; + pgoff_t ilx; +}; + struct queue_pages { struct list_head *pagelist; unsigned long flags; @@ -1156,8 +1161,9 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { - struct mempolicy *pol = (struct mempolicy *)private; - pgoff_t ilx = 0; /* improve on this later */ + struct migration_mpol *mmpol = (struct migration_mpol *)private; + struct mempolicy *pol = mmpol->pol; + pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); @@ -1212,6 +1218,7 @@ static long do_mbind(unsigned long start, unsigned long len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; + struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; @@ -1284,17 +1291,55 @@ static long do_mbind(unsigned long start, unsigned long len, } } - mmap_write_unlock(mm); - if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } + mmpol.pol = new; + mmpol.ilx = 0; + + /* + * In the interleaved case, attempt to allocate on exactly the + * targeted nodes, for the first VMA to be migrated; for later + * VMAs, the nodes will still be interleaved from the targeted + * nodemask, but one by one may be selected differently. 
+ */ + if (new->mode == MPOL_INTERLEAVE) { + struct page *page; + unsigned int order; + unsigned long addr = -EFAULT; + + list_for_each_entry(page, &pagelist, lru) { + if (!PageKsm(page)) + break; + } + if (!list_entry_is_head(page, &pagelist, lru)) { + vma_iter_init(&vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + addr = page_address_in_vma(page, vma); + if (addr != -EFAULT) + break; + } + } + if (addr != -EFAULT) { + order = compound_order(page); + /* We already know the pol, but not the ilx */ + mpol_cond_put(get_vma_policy(vma, addr, order, + &mmpol.ilx)); + /* Set base from which to increment by index */ + mmpol.ilx -= page->index >> order; + } + } + } + + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, - (unsigned long)new, MIGRATE_SYNC, + (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- cgit From cb61dad80fdc6a8f01b101e823a9335cd6d61071 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 24 Oct 2023 16:45:09 -0700 Subject: zswap: export compression failure stats During a zswap store attempt, the compression algorithm could fail (e.g. due to the page containing incompressible random data). This is not tracked in any of the existing zswap counters, making it hard to monitor for and investigate. We have run into this problem several times in our internal investigations on zswap store failures. This patch adds a dedicated debugfs counter for compression algorithm failures. Link: https://lkml.kernel.org/r/20231024234509.2680539-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 060857adca76..74411dfdad92 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -62,6 +62,8 @@ static u64 zswap_pool_limit_hit; static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; +/* Store failed due to compression algorithm failure */ +static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ @@ -1312,8 +1314,10 @@ bool zswap_store(struct folio *folio) ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) + if (ret) { + zswap_reject_compress_fail++; goto put_dstmem; + } /* store */ zpool = zswap_find_zpool(entry); @@ -1553,6 +1557,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_fail", 0444, + zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, -- cgit From ca6c2ce1b481996ae5b16e4589d3c0dd61899fa8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 18 Oct 2023 22:50:14 +0800 Subject: mm/vmalloc: fix the unchecked dereference warning in vread_iter() LKP reported the following smatch warning:
=================== smatch warnings: mm/vmalloc.c:3689 vread_iter() error: we previously assumed 'vm' could be null (see line 3667) ...... 06c8994626d1b7 @3667 size = vm ? get_vm_area_size(vm) : va_size(va); ...... 06c8994626d1b7 @3689 else if (!(vm->flags & VM_IOREMAP)) ^^^^^^^^^ Unchecked dereference ===================== This is not a runtime bug, because 'vm' can only be NULL at the flagged line when flags == VMAP_BLOCK. However, the case 'flags == VMAP_BLOCK' should never happen, and is already detected with a WARN_ON. Please check the vm_map_ram() implementation and the earlier check in vread_iter(), shown below: ~~~~~~~~~~~~~~~~~~~~~~~~~~ /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) continue; ~~~~~~~~~~~~~~~~~~~~~~~~~~ So add a check that 'vm' is non-NULL before dereferencing it in vread_iter(). This silences the smatch complaint. Link: https://lkml.kernel.org/r/ZTCURc8ZQE+KrTvS@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/ZS/2k6DIMd0tZRgK@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202310171600.WCrsOwFj-lkp@intel.com/ Cc: Lorenzo Stoakes Cc: Philip Li Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3fedb3ee0db..d12a17fc0c17 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3809,7 +3809,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); - else if (!(vm->flags & VM_IOREMAP)) + else if (!(vm && (vm->flags & VM_IOREMAP))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP area is treated as memory hole */ copied = zero_iter(iter, n); -- cgit From 19467a950b49432a84bf6dbadbbb17bdf89418b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:33 +0000 Subject: mm/damon/sysfs: remove requested targets when online-commit inputs damon_sysfs_set_targets(), which updates the targets of the context for online commitment, does not remove targets that have been removed from the corresponding sysfs files. As a result, the context can keep more targets than intended, and hence consume more memory and monitoring CPU resources than expected. Fix it by removing all targets of the context and filling them up again from the user input. This could cause unnecessary memory dealloc and realloc operations, but this is not a hot code path. Also, note that damon_target is stateless, and hence no data is lost. [sj@kernel.org: fix unnecessary monitoring results removal] Link: https://lkml.kernel.org/r/20231028213353.45397-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231022210735.46409-2-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: [5.19.x] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 70 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 5846970cedce..1a231bde18f9 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,58 +1150,60 @@ destroy_targets_out: return err; } -/* - * Search a target in a context that corresponds to the sysfs target input.
- * - * Return: pointer to the target if found, NULL if not found, or negative - * error code if the search failed. - */ -static struct damon_target *damon_sysfs_existing_target( - struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) { struct pid *pid; - struct damon_target *t; + struct damon_region *r, *next; - if (!damon_target_has_pid(ctx)) { - /* Up to only one target for paddr could exist */ - damon_for_each_target(t, ctx) - return t; - return NULL; - } + if (!damon_target_has_pid(ctx)) + return 0; - /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ pid = find_get_pid(sys_target->pid); if (!pid) - return ERR_PTR(-EINVAL); - damon_for_each_target(t, ctx) { - if (t->pid == pid) { - put_pid(pid); - return t; - } + return -EINVAL; + + /* no change to the target */ + if (pid == target->pid) { + put_pid(pid); + return 0; } - put_pid(pid); - return NULL; + + /* remove old monitoring results and update the target's pid */ + damon_for_each_region_safe(r, next, target) + damon_destroy_region(r, target); + put_pid(target->pid); + target->pid = pid; + return 0; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { - int i, err; + struct damon_target *t, *next; + int i = 0, err; /* Multiple physical address space monitoring targets makes no sense */ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) return -EINVAL; - for (i = 0; i < sysfs_targets->nr; i++) { + damon_for_each_target_safe(t, next, ctx) { + if (i < sysfs_targets->nr) { + damon_sysfs_update_target(t, ctx, + sysfs_targets->targets_arr[i]); + } else { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + i++; + } + + for (; i < sysfs_targets->nr; i++) { struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_sysfs_existing_target(st, ctx); - - if (IS_ERR(t)) - return PTR_ERR(t); - if (!t) - err = damon_sysfs_add_target(st, ctx); - else - err = damon_sysfs_set_regions(t, st->regions); + + err = damon_sysfs_add_target(st, ctx); if (err) return err; } -- cgit From 9732336006764e2ee61225387e3c70eae9139035 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 31 Oct 2023 17:01:31 +0000 Subject: mm/damon/sysfs: update monitoring target regions for online input commit When user input is committed online, the DAMON sysfs interface ignores the user input for the monitoring target regions. Such a request is valid and useful for monitoring ops that use fixed monitoring target regions, like 'paddr' or 'fvaddr'. Update the region boundaries as the user specified, too. Note that the monitoring results of the regions that overlap between the latest monitoring target regions and the new target regions are preserved. Treat an empty monitoring target regions request as a request to make no change to the monitoring target regions. Otherwise, users would have to set the monitoring target regions to the current ones for every online input commit, which could be challenging for DAMON ops that update the monitoring target regions dynamically, like 'vaddr'. If the user really needs to remove all monitoring target regions, they can simply remove the target and then create the target again with empty target regions.
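To make the rule concrete, here is a minimal userspace sketch of it, assuming a toy target with a plain array in place of DAMON's region lists; commit_regions() and its capacity assumption are hypothetical, not the kernel API. An empty request keeps the current regions (and so the monitoring results they carry), while a non-empty one replaces the boundaries:

	#include <stdio.h>

	struct region { unsigned long start, end; };

	/*
	 * Empty request: keep current regions and their results.
	 * Non-empty request: replace the boundaries with the user's.
	 * Assumes cur[] has capacity for nr_req entries.
	 */
	static void commit_regions(struct region *cur, size_t *nr_cur,
				   const struct region *req, size_t nr_req)
	{
		if (!nr_req)
			return;
		for (size_t i = 0; i < nr_req; i++)
			cur[i] = req[i];
		*nr_cur = nr_req;
	}

	int main(void)
	{
		struct region cur[4] = { { 0x1000, 0x5000 } };
		size_t nr_cur = 1;
		const struct region req[] = { { 0x2000, 0x3000 }, { 0x4000, 0x6000 } };

		commit_regions(cur, &nr_cur, req, 2);	/* boundaries replaced */
		commit_regions(cur, &nr_cur, NULL, 0);	/* no-op: regions kept */

		for (size_t i = 0; i < nr_cur; i++)
			printf("region %zu: %#lx-%#lx\n", i, cur[i].start, cur[i].end);
		return 0;
	}

Removing the target and re-creating it, as the last paragraph suggests, is then the only way to end up with no regions at all.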
Link: https://lkml.kernel.org/r/20231031170131.46972-1-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1a231bde18f9..e27846708b5a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,34 +1150,47 @@ destroy_targets_out: return err; } -static int damon_sysfs_update_target(struct damon_target *target, - struct damon_ctx *ctx, - struct damon_sysfs_target *sys_target) +static int damon_sysfs_update_target_pid(struct damon_target *target, int pid) { - struct pid *pid; - struct damon_region *r, *next; - - if (!damon_target_has_pid(ctx)) - return 0; + struct pid *pid_new; - pid = find_get_pid(sys_target->pid); - if (!pid) + pid_new = find_get_pid(pid); + if (!pid_new) return -EINVAL; - /* no change to the target */ - if (pid == target->pid) { - put_pid(pid); + if (pid_new == target->pid) { + put_pid(pid_new); return 0; } - /* remove old monitoring results and update the target's pid */ - damon_for_each_region_safe(r, next, target) - damon_destroy_region(r, target); put_pid(target->pid); - target->pid = pid; + target->pid = pid_new; return 0; } +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) +{ + int err; + + if (damon_target_has_pid(ctx)) { + err = damon_sysfs_update_target_pid(target, sys_target->pid); + if (err) + return err; + } + + /* + * Do monitoring target region boundary update only if one or more + * regions are set by the user. This is for keeping current monitoring + * target results and range easier, especially for dynamic monitoring + * target regions update ops like 'vaddr'. + */ + if (sys_target->regions->nr) + err = damon_sysfs_set_regions(target, sys_target->regions); + return err; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { -- cgit
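As a footnote on the refactored damon_sysfs_update_target_pid() above, the reference dance can be modelled in userspace like this. It is a rough sketch under stated assumptions: grab() and drop() are hypothetical stand-ins for find_get_pid() and put_pid(), backed here by a static table so that equal pids yield pointer-identical objects, as in the kernel:

	#include <stdio.h>
	#include <stddef.h>

	struct pid_ref { int pid; int refs; };

	/* toy canonical table, so grab() returns the same object per pid */
	static struct pid_ref table[] = { { 100, 1 }, { 200, 1 } };

	static struct pid_ref *grab(int pid)	/* models find_get_pid() */
	{
		for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
			if (table[i].pid == pid) {
				table[i].refs++;
				return &table[i];
			}
		}
		return NULL;
	}

	static void drop(struct pid_ref *p)	/* models put_pid() */
	{
		if (p)
			p->refs--;
	}

	/* mirror of the update logic: only swap when the pid actually changed */
	static int update_target_pid(struct pid_ref **target_pid, int pid)
	{
		struct pid_ref *pid_new = grab(pid);

		if (!pid_new)
			return -1;
		if (pid_new == *target_pid) {
			drop(pid_new);		/* no change: drop the extra ref */
			return 0;
		}
		drop(*target_pid);		/* release the old pid... */
		*target_pid = pid_new;		/* ...keep the ref on the new one */
		return 0;
	}

	int main(void)
	{
		struct pid_ref *target = grab(100);

		update_target_pid(&target, 100);	/* same pid: nothing leaks */
		update_target_pid(&target, 200);	/* different pid: swapped in */
		printf("target pid %d, refs now %d and %d\n",
		       target->pid, table[0].refs, table[1].refs);
		return 0;
	}

Only the early-return-on-equal-pid and the ownership transfer are being modelled; everything else in the real function (region handling, sysfs plumbing) is omitted.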