From 122e093c1734361dedb64f65c99b93e28e4624f4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:33:26 -0700 Subject: mm/page_alloc: fix memory map initialization for descending nodes On systems with memory nodes sorted in descending order, for instance Dell Precision WorkStation T5500, the struct pages for higher PFNs (and respectively lower nodes) could be overwritten by the initialization of struct pages corresponding to the holes in the memory sections. For example, for the memory layout below [ 0.245624] Early memory node ranges [ 0.248496] node 1: [mem 0x0000000000001000-0x0000000000090fff] [ 0.251376] node 1: [mem 0x0000000000100000-0x00000000dbdf8fff] [ 0.254256] node 1: [mem 0x0000000100000000-0x0000001423ffffff] [ 0.257144] node 0: [mem 0x0000001424000000-0x0000002023ffffff] the range 0x1424000000 - 0x1428000000 at the beginning of node 0 starts in the middle of a section and will be considered a hole during the initialization of the last section in node 1. The wrong initialization of the memory map causes a panic on boot when CONFIG_DEBUG_VM is enabled. Reorder the loops of the memory map initialization so that the outer loop always iterates over populated memory regions in ascending order and the inner loop selects the zone corresponding to the PFN range. This way, initialization of the struct pages for the memory holes is always done for ranges that are actually not populated. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/YNXlMqBbL+tBG7yq@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=213073 Link: https://lkml.kernel.org/r/20210624062305.10940-1-rppt@kernel.org Fixes: 0740a50b9baa ("mm/page_alloc.c: refactor initialization of struct page for holes in memory layout") Signed-off-by: Mike Rapoport Cc: Boris Petkov Cc: Robert Shteynfeld Cc: Baoquan He Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 94 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 36 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef2265f86b91..5b5c9f5813b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6400,7 +6400,7 @@ void __ref memmap_init_zone_device(struct zone *zone, return; /* - * The call to memmap_init_zone should have already taken care + * The call to memmap_init should have already taken care * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ @@ -6465,7 +6465,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during - * memmap_init_zone(). + * memmap_init_zone_range(). * * But, there could be struct pages that correspond to holes in * memblock.memory. This can happen because of the following reasons: @@ -6484,9 +6484,9 @@ static void __meminit zone_init_free_lists(struct zone *zone) * zone/node above the hole except for the trailing pages in the last * section that will be appended to the zone/node below.
*/ -static u64 __meminit init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) +static void __init init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { unsigned long pfn; u64 pgcnt = 0; @@ -6502,56 +6502,77 @@ static u64 __meminit init_unavailable_range(unsigned long spfn, pgcnt++; } - return pgcnt; + if (pgcnt) + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + node, zone_names[zone], pgcnt); } #else -static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, - int zone, int node) +static inline void init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { - return 0; } #endif -void __meminit __weak memmap_init_zone(struct zone *zone) +static void __init memmap_init_zone_range(struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *hole_pfn) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; - int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); - static unsigned long hole_pfn; + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + + if (start_pfn >= end_pfn) + return; + + memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (*hole_pfn < start_pfn) + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + + *hole_pfn = end_pfn; +} + +static void __init memmap_init(void) +{ unsigned long start_pfn, end_pfn; - u64 pgcnt = 0; + unsigned long hole_pfn = 0; + int i, j, zone_id, nid; - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); - end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + struct pglist_data *node = NODE_DATA(nid); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = node->node_zones + j; - if (end_pfn > start_pfn) - memmap_init_range(end_pfn - start_pfn, nid, - zone_id, start_pfn, zone_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + if (!populated_zone(zone)) + continue; - if (hole_pfn < start_pfn) - pgcnt += init_unavailable_range(hole_pfn, start_pfn, - zone_id, nid); - hole_pfn = end_pfn; + memmap_init_zone_range(zone, start_pfn, end_pfn, + &hole_pfn); + zone_id = j; + } } #ifdef CONFIG_SPARSEMEM /* - * Initialize the hole in the range [zone_end_pfn, section_end]. - * If zone boundary falls in the middle of a section, this hole - * will be re-initialized during the call to this function for the - * higher zone. + * Initialize the memory map for hole in the range [memory_end, + * section_end]. + * Append the pages in this hole to the highest zone in the last + * node. 
+ * The call to init_unavailable_range() is outside the ifdef to + * silence the compiler warning about zone_id set but not used; + * for FLATMEM it is a nop anyway */ - end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); if (hole_pfn < end_pfn) - pgcnt += init_unavailable_range(hole_pfn, end_pfn, - zone_id, nid); #endif - - if (pgcnt) - pr_info(" %s zone: %llu pages in unavailable ranges\n", - zone->name, pgcnt); + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } static int zone_batchsize(struct zone *zone) @@ -7254,7 +7275,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(zone); init_currently_empty_zone(zone, zone->zone_start_pfn, size); - memmap_init_zone(zone); } } @@ -7780,6 +7800,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + + memmap_init(); } static int __init cmdline_parse_core(char *p, unsigned long *core, -- cgit From ff4b2b4014cbffb3d32b22629252f4dc8616b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:33:29 -0700 Subject: mm/page_alloc: correct return value of populated elements if bulk array is populated Dave Jones reported the following: This made it into 5.13 final, and completely breaks NFSD for me (Serving tcp v3 mounts). Existing mounts on clients hang, as do new mounts from new clients. Rebooting the server back to rc7 everything recovers. The commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") returns the wrong value if the array is already populated, which is interpreted as an allocation failure. Dave reported this fixes his problem and it also passed a test running dbench over NFS. Link: https://lkml.kernel.org/r/20210628150219.GC3840@techsingularity.net Fixes: b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") Signed-off-by: Mel Gorman Reported-by: Dave Jones Tested-by: Dave Jones Cc: Dan Carpenter Cc: Jesper Dangaard Brouer Cc: Vlastimil Babka Cc: [5.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b5c9f5813b9..2bf03c76504b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5058,7 +5058,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Already populated array? */ if (unlikely(page_array && nr_pages - nr_populated == 0)) - return 0; + return nr_populated; /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) -- cgit From d2f07ec052ac1a720d6f1919e3dee7d73f04d495 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:07 -0700 Subject: mm: make __dump_page static Patch series "Constify struct page arguments". While working on various solutions to the 32-bit struct page size regression, one of the problems I found was the networking stack expects to be able to pass const struct page pointers around, and the mm doesn't provide a lot of const-friendly functions to call. The root tangle of problems is that a lot of functions call VM_BUG_ON_PAGE(), which calls dump_page(), which calls a lot of functions which don't take a const struct page (but could be const). This patch (of 6): The only caller of __dump_page() now opencodes dump_page(), so remove it as an externally visible symbol.
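For context, dump_page() is now the single entry point and __dump_page() becomes a static helper behind it. A minimal sketch of the resulting shape of mm/debug.c (the long __dump_page() body is omitted):

    /* mm/debug.c, sketch: __dump_page() is now static, dump_page() wraps it */
    static void __dump_page(struct page *page, const char *reason)
    {
            /* ... print page state, flags, mapping, raw words ... */
    }

    void dump_page(struct page *page, const char *reason)
    {
            __dump_page(page, reason);
            dump_page_owner(page);
    }
    EXPORT_SYMBOL(dump_page);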
Link: https://lkml.kernel.org/r/20210416231531.2521383-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210416231531.2521383-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bf03c76504b..4087340fca32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -658,8 +658,7 @@ static void bad_page(struct page *page, const char *reason) pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - __dump_page(page, reason); - dump_page_owner(page); + dump_page(page, reason); print_modules(); dump_stack(); -- cgit From 691d9497285a90346a67bfee5cac2007e5e18405 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 28 Jun 2021 19:41:10 -0700 Subject: mm/page_alloc: bail out on fatal signal during reclaim/compaction retry attempt A customer experienced a low-memory situation and decided to issue a SIGKILL (i.e. a fatal signal). Instead of promptly terminating as one would expect, the aforementioned task remained unresponsive. Further investigation indicated that the task was "stuck" in the reclaim/compaction retry loop. Now, it does not make sense to retry compaction when a fatal signal is pending. In the context of try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned; albeit, not every zone, on the zone list, would be considered in the case a fatal signal is found to be pending. Yet, in should_compact_retry(), given the last known compaction result, each zone, on the zone list, can be considered/or checked (see compaction_zonelist_suitable()). For example, if a zone was found to succeed, then reclaim/compaction would be tried again (notwithstanding the above). This patch ensures that compaction is not needlessly retried irrespective of the last known compaction result e.g. if it was skipped, in the unlikely case a fatal signal is found pending. So, OOM is at least attempted. Link: https://lkml.kernel.org/r/20210520142901.3371299-1-atomlin@redhat.com Signed-off-by: Aaron Tomlin Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4087340fca32..ea1efbb06e40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4251,6 +4251,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (!order) return false; + if (fatal_signal_pending(current)) + return false; + if (compaction_made_progress(compact_result)) (*compaction_retries)++; -- cgit From ca891f41c4c7921a03dfd0fa1faf324393724480 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:22 -0700 Subject: mm: constify get_pfnblock_flags_mask and get_pfnblock_migratetype The struct page is not modified by these routines, so it can be marked const. 
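To illustrate the benefit, a read-only predicate can now be written against a const page without casts. The helper below is hypothetical, purely for illustration of what the constified interfaces permit:

    /* Hypothetical read-only helper enabled by the constification */
    static bool pfnblock_is_movable(const struct page *page, unsigned long pfn)
    {
            return get_pfnblock_migratetype(page, pfn) == MIGRATE_MOVABLE;
    }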
Link: https://lkml.kernel.org/r/20210416231531.2521383-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea1efbb06e40..4f5eedb6593a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -474,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) #endif /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, +static inline unsigned long *get_pageblock_bitmap(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM @@ -484,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); @@ -495,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) } static __always_inline -unsigned long __get_pfnblock_flags_mask(struct page *page, +unsigned long __get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask) { @@ -520,13 +520,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long mask) +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { return __get_pfnblock_flags_mask(page, pfn, mask); } -static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +static __always_inline int get_pfnblock_migratetype(const struct page *page, + unsigned long pfn) { return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } -- cgit From 9660ecaa79ce5c068aa3138ca7e29a9402f284ed Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 28 Jun 2021 19:41:31 -0700 Subject: mm/page_alloc: switch to pr_debug Having such debug messages in the dmesg log may confuse users. Therefore restrict debug output to cases where DEBUG is defined or dynamic debugging is enabled for the respective code piece. 
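The reason the messages disappear for most users is that pr_debug() compiles down to nothing unless debugging is enabled, roughly as follows (simplified from include/linux/printk.h; the real definitions carry more plumbing):

    #if defined(CONFIG_DYNAMIC_DEBUG)
    /* runtime-selectable via the dynamic debug control file */
    #define pr_debug(fmt, ...) dynamic_pr_debug(fmt, ##__VA_ARGS__)
    #elif defined(DEBUG)
    /* compile-time enabled for this translation unit */
    #define pr_debug(fmt, ...) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
    #else
    /* compiles to nothing but still typechecks the arguments */
    #define pr_debug(fmt, ...) no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
    #endif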
Link: https://lkml.kernel.org/r/976adb93-3041-ce63-48fc-55a6096a51c1@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5eedb6593a..902f889a324d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6770,9 +6770,8 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", - zone->name, zone->present_pages, - zone_batchsize(zone)); + pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, + zone->present_pages, zone_batchsize(zone)); } void __meminit init_currently_empty_zone(struct zone *zone, @@ -7042,8 +7041,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, - realtotalpages); + pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } #ifndef CONFIG_SPARSEMEM @@ -7243,9 +7241,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat) if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + pr_debug(" %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); @@ -7254,8 +7251,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { freesize -= dma_reserve; - printk(KERN_DEBUG " %s zone: %lu pages reserved\n", - zone_names[0], dma_reserve); + pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) -- cgit From 28f836b6777b6f42dce068a40d83a891deaaca37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:38 -0700 Subject: mm/page_alloc: split per cpu page lists and zone stats The PCP (per-cpu page allocator in page_alloc.c) shares locking requirements with vmstat and the zone lock, which is inconvenient and causes some issues. For example, the PCP list and vmstat share the same per-cpu space meaning that it's possible that vmstat updates dirty cache lines holding per-cpu lists across CPUs unless padding is used. Second, PREEMPT_RT does not want to disable IRQs for too long in the page allocator. This series splits the locking requirements and uses lock types more suitable for PREEMPT_RT, reduces the time when special locking is required for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT kernels. Why local_lock? PREEMPT_RT considers the following sequence to be unsafe, as documented in Documentation/locking/locktypes.rst: local_irq_disable(); spin_lock(&lock); The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save) -> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). While it's possible to separate this out, it generally means there are points where we enable IRQs and then immediately disable them again. To prevent a migration and the per-cpu pointer going stale, migrate_disable is also needed. That is a custom lock that is similar to, but worse than, local_lock.
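For contrast, the local_lock pattern adopted later in this series expresses the same per-CPU exclusion as a named lock that the RT tree can substitute. A sketch of the shape the later patches use:

    /* Sketch of the pattern introduced later in this series */
    static DEFINE_PER_CPU(struct pagesets, pagesets) = {
            .lock = INIT_LOCAL_LOCK(lock),
    };

            local_lock_irqsave(&pagesets.lock, flags);
            pcp = this_cpu_ptr(zone->per_cpu_pageset);
            /* ... operate on the per-cpu lists ... */
            local_unlock_irqrestore(&pagesets.lock, flags);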
Furthermore, on PREEMPT_RT, it's undesirable to leave IRQs disabled for too long. By converting to local_lock, which disables migration on PREEMPT_RT, the locking requirements can be separated so that the protections for the PCP, stats and the zone lock can start moving to PREEMPT_RT-safe equivalents. As a bonus, local_lock also means that PROVE_LOCKING does something useful. After that, it's obvious that zone_statistics incurs too much overhead and leaves IRQs disabled for longer than necessary on !PREEMPT_RT kernels. zone_statistics uses perfectly accurate counters requiring IRQs be disabled for parallel RMW sequences when inaccurate ones like vm_events would do. The series makes the NUMA statistics (NUMA_HIT and friends) inaccurate counters that then require no special protection on !PREEMPT_RT. The bulk page allocator can then do stat updates in bulk with IRQs enabled, which should improve efficiency. Technically, this could have been done without the local_lock and vmstat conversion work, and the order simply reflects the timing of when different series were implemented. Finally, there are places where we conflate IRQs being disabled for the PCP with the IRQ-safe zone spinlock. The remainder of the series reduces the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels. By the end of the series, page_alloc.c does not call local_irq_save, so the locking scope is a bit clearer. The one exception is that modifying NR_FREE_PAGES still happens in places where it's known the IRQs are disabled, as it's harmless for PREEMPT_RT and would be expensive to split the locking there. No performance data is included because, despite the overhead of the stats, it's within the noise for most workloads on !PREEMPT_RT. However, Jesper Dangaard Brouer ran a page allocation microbenchmark on an E5-1650 v4 @ 3.60GHz CPU on the first version of this series. Focusing on the array variant of the bulk page allocator reveals the following.

(CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz)
ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size

    Bulk size    Baseline    Patched
    1            56.383      54.225 (+3.83%)
    2            40.047      35.492 (+11.38%)
    3            37.339      32.643 (+12.58%)
    4            35.578      30.992 (+12.89%)
    8            33.592      29.606 (+11.87%)
    16           32.362      28.532 (+11.85%)
    32           31.476      27.728 (+11.91%)
    64           30.633      27.252 (+11.04%)
    128          30.596      27.090 (+11.46%)

While this is a positive outcome, the series is more likely to be interesting to the RT people in terms of getting parts of the PREEMPT_RT tree into mainline. This patch (of 9): The per-cpu page allocator lists and the per-cpu vmstat deltas are stored in the same struct per_cpu_pageset even though vmstats have no direct impact on the per-cpu page lists. This is inconsistent because the vmstats for a node are stored in a dedicated structure. The bigger issue is that the per_cpu_pages structure is not cache-aligned and stat updates either cache-conflict with adjacent per-cpu lists, incurring a runtime cost, or padding is required, incurring a memory cost. This patch splits the per-cpu pagelists and the vmstat deltas into separate structures. It's mostly a mechanical conversion, but some variable renaming is done to clearly distinguish the per-cpu pages structure (pcp) from the vmstats (pzstats). Superficially, this appears to increase the size of the per_cpu_pages structure, but the movement of expire fills a structure hole, so there is no impact overall.
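In outline, the split looks like this; the field lists are abbreviated, and the full definitions live in the include/linux/mmzone.h part of the patch, which this page_alloc.c-limited view does not show:

    /* Before: one per-cpu structure mixing page lists and stat deltas */
    struct per_cpu_pageset {
            struct per_cpu_pages pcp;       /* hot: the pcp free lists */
            /* vmstat deltas: vm_numa_stat_diff[], vm_stat_diff[], ... */
    };

    /* After: allocator and vmstat state become dedicated structures */
    struct per_cpu_pages {                  /* zone->per_cpu_pageset */
            int count, high, batch;
            /* 'expire' moves in here, filling a structure hole */
            struct list_head lists[MIGRATE_PCPTYPES];
    };
    struct per_cpu_zonestat {               /* zone->per_cpu_zonestats */
            /* vm_stat_diff[], vm_numa_stat_diff[], stat_threshold */
    };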
[mgorman@techsingularity.net: make it W=1 cleaner] Link: https://lkml.kernel.org/r/20210514144622.GA3735@techsingularity.net [mgorman@techsingularity.net: make it W=1 even cleaner] Link: https://lkml.kernel.org/r/20210516140705.GB3735@techsingularity.net [lkp@intel.com: check struct per_cpu_zonestat has a non-zero size] [vbabka@suse.cz: Init zone->per_cpu_zonestats properly] Link: https://lkml.kernel.org/r/20210512095458.30632-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210512095458.30632-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Jesper Dangaard Brouer Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 85 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 38 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 902f889a324d..330c7307a92b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3026,15 +3026,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); + local_irq_restore(flags); } @@ -3133,7 +3132,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { - struct per_cpu_pageset *pcp; + struct per_cpu_pages *pcp; struct zone *z; bool has_pcps = false; @@ -3144,13 +3143,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) */ has_pcps = true; } else if (zone) { - pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + if (pcp->count) has_pcps = true; } else { for_each_populated_zone(z) { - pcp = per_cpu_ptr(z->pageset, cpu); - if (pcp->pcp.count) { + pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); + if (pcp->count) { has_pcps = true; break; } @@ -3280,7 +3279,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) migratetype = MIGRATE_MOVABLE; } - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= READ_ONCE(pcp->high)) @@ -3496,7 +3495,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { @@ -5105,7 +5104,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; while (nr_populated < nr_pages) { @@ -5720,7 +5719,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) continue; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; } printk("active_anon:%lu inactive_anon:%lu 
isolated_anon:%lu\n" @@ -5812,7 +5811,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) free_pcp = 0; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; show_node(zone); printk(KERN_CONT @@ -5853,7 +5852,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), - K(this_cpu_read(zone->pageset->pcp.count)), + K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -6180,11 +6179,12 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void pageset_init(struct per_cpu_pageset *p); +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); /* These effectively disable the pcplists in the boot pageset completely */ #define BOOT_PAGESET_HIGH 0 #define BOOT_PAGESET_BATCH 1 -static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) @@ -6251,7 +6251,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) - pageset_init(&per_cpu(boot_pageset, cpu)); + per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -6650,14 +6650,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, WRITE_ONCE(pcp->high, high); } -static void pageset_init(struct per_cpu_pageset *p) +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - struct per_cpu_pages *pcp; int migratetype; - memset(p, 0, sizeof(*p)); + memset(pcp, 0, sizeof(*pcp)); + memset(pzstats, 0, sizeof(*pzstats)); - pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); @@ -6674,12 +6673,12 @@ static void pageset_init(struct per_cpu_pageset *p) static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, unsigned long batch) { - struct per_cpu_pageset *p; + struct per_cpu_pages *pcp; int cpu; for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_update(&p->pcp, high, batch); + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pageset_update(pcp, high, batch); } } @@ -6714,13 +6713,20 @@ static void zone_set_pageset_high_and_batch(struct zone *zone) void __meminit setup_zone_pageset(struct zone *zone) { - struct per_cpu_pageset *p; int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); + /* Size may be 0 on !SMP && !NUMA */ + if (sizeof(struct per_cpu_zonestat) > 0) + zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); + + zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_init(p); + struct per_cpu_pages *pcp; + struct per_cpu_zonestat *pzstats; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + per_cpu_pages_init(pcp, pzstats); } 
zone_set_pageset_high_and_batch(zone); @@ -6747,9 +6753,9 @@ void __init setup_per_cpu_pageset(void) * the nodes these zones are associated with. */ for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); - memset(pcp->vm_numa_stat_diff, 0, - sizeof(pcp->vm_numa_stat_diff)); + struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); + memset(pzstats->vm_numa_stat_diff, 0, + sizeof(pzstats->vm_numa_stat_diff)); } #endif @@ -6765,7 +6771,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * relies on the ability of the linker to provide the * offset of a (static) per cpu variable into the per cpu area. */ - zone->pageset = &boot_pageset; + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; zone->pageset_high = BOOT_PAGESET_HIGH; zone->pageset_batch = BOOT_PAGESET_BATCH; @@ -9046,15 +9053,17 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { int cpu; - struct per_cpu_pageset *pset; + struct per_cpu_zonestat *pzstats; - if (zone->pageset != &boot_pageset) { + if (zone->per_cpu_pageset != &boot_pageset) { for_each_online_cpu(cpu) { - pset = per_cpu_ptr(zone->pageset, cpu); - drain_zonestat(zone, pset); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + drain_zonestat(zone, pzstats); } - free_percpu(zone->pageset); - zone->pageset = &boot_pageset; + free_percpu(zone->per_cpu_pageset); + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; } } -- cgit From dbbee9d5cd83f9d0a29639e260516907ceb2ac3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:41 -0700 Subject: mm/page_alloc: convert per-cpu list protection to local_lock There is a lack of clarity of what exactly local_irq_save/local_irq_restore protects in page_alloc.c. It conflates the protection of per-cpu page allocation structures with per-cpu vmstat deltas. This patch protects the PCP structure using local_lock, which for most configurations is identical to IRQ enabling/disabling. The scope of the lock is still wider than it should be, but this is narrowed later. It is possible for the local_lock to be embedded safely within struct per_cpu_pages, but it adds complexity to free_unref_page_list.
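What makes this RT-friendly is how the primitive maps onto each configuration; informally (see Documentation/locking/locktypes.rst for the authoritative description):

    /*
     * !PREEMPT_RT: local_lock_irqsave() behaves like local_irq_save()
     * plus lockdep coverage, so the cost is unchanged on normal kernels.
     *
     * PREEMPT_RT: it takes a per-CPU spinlock and disables migration;
     * IRQs stay enabled and the critical section remains preemptible.
     */
    local_lock_irqsave(&pagesets.lock, flags);
    pcp = this_cpu_ptr(zone->per_cpu_pageset);
    local_unlock_irqrestore(&pagesets.lock, flags);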
[akpm@linux-foundation.org: coding style fixes] [mgorman@techsingularity.net: work around a pahole limitation with zero-sized struct pagesets] Link: https://lkml.kernel.org/r/20210526080741.GW30378@techsingularity.net [lkp@intel.com: Make pagesets static] Link: https://lkml.kernel.org/r/20210512095458.30632-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 61 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 15 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 330c7307a92b..89872ad5e872 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -122,6 +122,24 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) +struct pagesets { + local_lock_t lock; +#if defined(CONFIG_DEBUG_INFO_BTF) && \ + !defined(CONFIG_DEBUG_LOCK_ALLOC) && \ + !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT) + /* + * pahole 1.21 and earlier gets confused by zero-sized per-CPU + * variables and produces invalid BTF. Ensure that + * sizeof(struct pagesets) != 0 for older versions of pahole. + */ + char __pahole_hack; + #warning "pahole too old to support zero-sized struct pagesets" +#endif +}; +static DEFINE_PER_CPU(struct pagesets, pagesets) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1453,6 +1471,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (--count && --batch_free && !list_empty(list)); } + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1573,6 +1595,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, return; migratetype = get_pfnblock_migratetype(page, pfn); + + /* + * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock + * and protect vmstat updates. + */ local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype, @@ -2955,6 +2982,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. 
+ */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3007,12 +3038,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -3028,13 +3059,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) unsigned long flags; struct per_cpu_pages *pcp; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3297,9 +3328,9 @@ void free_unref_page(struct page *page) if (!free_unref_page_prepare(page, pfn)) return; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); free_unref_page_commit(page, pfn); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3319,7 +3350,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, pfn); } - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_private(page); @@ -3332,12 +3363,12 @@ void free_unref_page_list(struct list_head *list) * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); } } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3494,7 +3525,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct page *page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); @@ -3502,7 +3533,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -5103,7 +5134,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed; /* Attempt the batch allocation */ - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; @@ -5141,12 +5172,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; failed_irq: - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); -- cgit From f19298b9516c1a031b34b4147773457e3efe743b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:44 -0700 Subject: mm/vmstat: convert NUMA statistics to basic NUMA counters NUMA statistics are maintained on the zone level for hits, misses, foreign etc but nothing relies on them being perfectly accurate for functional correctness. 
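After the conversion, a NUMA event update becomes a plain per-cpu add with no IRQ-safe read-modify-write, along the lines of the helper below (a sketch of the vmstat helper this patch introduces; simplified):

    /* Inexact by design: a racing update may occasionally be lost */
    static inline void __count_numa_event(struct zone *zone,
                                          enum numa_stat_item item)
    {
            struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

            raw_cpu_inc(pzstats->vm_numa_event[item]);
    }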
The counters are used by userspace to get a general overview of a workload's NUMA behaviour, but the page allocator incurs a high cost to maintain perfect accuracy, similar to what is required for a vmstat counter like NR_FREE_PAGES. There is even a sysctl, vm.numa_stat, to allow userspace to turn off the collection of NUMA statistics like NUMA_HIT. This patch converts NUMA_HIT and friends to be NUMA events with similar accuracy to VM events. There is a possibility that slight errors will be introduced, but the overall trend as seen by userspace will be similar. The counters are no longer updated from vmstat_refresh context as it is unnecessary overhead for counters that may never be read by userspace. Note that counters could be maintained at the node level to save space, but it would have a user-visible impact due to /proc/zoneinfo. [lkp@intel.com: Fix misplaced closing brace for !CONFIG_NUMA] Link: https://lkml.kernel.org/r/20210512095458.30632-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89872ad5e872..4e03109bdae5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3480,12 +3480,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __inc_numa_state(z, NUMA_HIT); + __count_numa_event(z, NUMA_HIT); else { - __inc_numa_state(z, NUMA_MISS); - __inc_numa_state(preferred_zone, NUMA_FOREIGN); + __count_numa_event(z, NUMA_MISS); + __count_numa_event(preferred_zone, NUMA_FOREIGN); } - __inc_numa_state(z, local_stat); + __count_numa_event(z, local_stat); #endif } @@ -6785,8 +6785,8 @@ void __init setup_per_cpu_pageset(void) */ for_each_possible_cpu(cpu) { struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); - memset(pzstats->vm_numa_stat_diff, 0, - sizeof(pzstats->vm_numa_stat_diff)); + memset(pzstats->vm_numa_event, 0, + sizeof(pzstats->vm_numa_event)); } #endif -- cgit From 3e23060b2d0b7eebf37b3b6043ea68da0ebc0646 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:50 -0700 Subject: mm/page_alloc: batch the accounting updates in the bulk allocator Now that the zone_statistics are simple counters that do not require special protection, the bulk allocator accounting updates can be batch updated without adding too much complexity with protected RMW updates or using xchg. Link: https://lkml.kernel.org/r/20210512095458.30632-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e03109bdae5..6bb9b87cf7d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3467,7 +3467,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) * * Must be called with interrupts disabled.
*/ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + long nr_account) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -3480,12 +3481,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __count_numa_event(z, NUMA_HIT); + __count_numa_events(z, NUMA_HIT, nr_account); else { - __count_numa_event(z, NUMA_MISS); - __count_numa_event(preferred_zone, NUMA_FOREIGN); + __count_numa_events(z, NUMA_MISS, nr_account); + __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); } - __count_numa_event(z, local_stat); + __count_numa_events(z, local_stat, nr_account); #endif } @@ -3531,7 +3532,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); } local_unlock_irqrestore(&pagesets.lock, flags); return page; @@ -3592,7 +3593,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); local_irq_restore(flags); out: @@ -5077,7 +5078,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int nr_populated = 0; + int nr_populated = 0, nr_account = 0; if (unlikely(nr_pages <= 0)) return 0; @@ -5154,15 +5155,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed_irq; break; } - - /* - * Ideally this would be batched but the best way to do - * that cheaply is to first convert zone_statistics to - * be inaccurate per-cpu counter like vm_events to avoid - * a RMW cycle then do the accounting with IRQs enabled. - */ - __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); - zone_statistics(ac.preferred_zoneref->zone, zone); + nr_account++; prep_new_page(page, 0, gfp, 0); if (page_list) @@ -5172,6 +5165,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); + zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; -- cgit From 43c95bcc51e4e7f3e3cbce01515fe429a4cf12a7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:54 -0700 Subject: mm/page_alloc: reduce duration that IRQs are disabled for VM counters IRQs are left disabled for the zone and node VM event counters. This is unnecessary as the affected counters are allowed to race for preemption and IRQs. This patch reduces the scope of IRQs being disabled via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One __mod_zone_freepage_state call is still made with IRQs disabled. While this could be moved out, it's not free on all architectures as some require IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels.
Link: https://lkml.kernel.org/r/20210512095458.30632-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6bb9b87cf7d5..161bcda61520 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3530,11 +3530,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); } - local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -3586,15 +3586,15 @@ struct page *rmqueue(struct zone *preferred_zone, if (!page) page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); - local_irq_restore(flags); out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3607,7 +3607,7 @@ out: return page; failed: - local_irq_restore(flags); + spin_unlock_irqrestore(&zone->lock, flags); return NULL; } @@ -5165,11 +5165,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + local_unlock_irqrestore(&pagesets.lock, flags); + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); - local_unlock_irqrestore(&pagesets.lock, flags); - return nr_populated; failed_irq: -- cgit From 56f0e661ea8c0178e80048df7166653a51ef2c3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:57 -0700 Subject: mm/page_alloc: explicitly acquire the zone lock in __free_pages_ok __free_pages_ok() disables IRQs before calling a common helper free_one_page() that acquires the zone lock. This is not safe according to Documentation/locking/locktypes.rst, and in this context the IRQ disabling is not protecting a per_cpu_pages structure either (or a local_lock would be used). This patch explicitly acquires the lock with spin_lock_irqsave instead of relying on a helper. This removes the last instance of local_irq_save() in page_alloc.c.
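Reduced to its essentials, the before/after locking pattern in __free_pages_ok() is:

    /* Before: not RT-safe, a spinlock_t taken inside a hard-IRQ-off region */
    local_irq_save(flags);
    spin_lock(&zone->lock);         /* taken inside free_one_page() */

    /* After: the lock primitive manages the IRQ state itself */
    spin_lock_irqsave(&zone->lock, flags);
    /* free the page to the buddy lists */
    spin_unlock_irqrestore(&zone->lock, flags);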
Link: https://lkml.kernel.org/r/20210512095458.30632-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 161bcda61520..f1a51c163e75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1590,21 +1590,21 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); - /* - * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock - * and protect vmstat updates. - */ - local_irq_save(flags); + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype, - fpi_flags); - local_irq_restore(flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); } void __free_pages_core(struct page *page, unsigned int order) -- cgit From df1acc856923c0a65c28b588585449106c316b71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:00 -0700 Subject: mm/page_alloc: avoid conflating IRQs disabled with zone->lock Historically, when freeing pages, free_one_page() assumed that callers had IRQs disabled and the zone->lock could be acquired with spin_lock(). This confuses the scope of what local_lock_irq is protecting and what zone->lock is protecting in free_unref_page_list in particular. This patch uses spin_lock_irqsave() for the zone->lock in free_one_page() instead of relying on callers to have disabled IRQs. free_unref_page_commit() is changed to only deal with PCP pages protected by the local lock. free_unref_page_list() then first frees isolated pages to the buddy lists with free_one_page() and frees the rest of the pages to the PCP via free_unref_page_commit(). The end result is that free_one_page() no longer depends on side effects of local_lock to be correct. Note that this may incur a performance penalty while memory hot-remove is running, but that is not a common operation.
[lkp@intel.com: Ensure CMA pages get added to correct pcp list] Link: https://lkml.kernel.org/r/20210512095458.30632-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 75 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1a51c163e75..dd367e5df8cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,13 +1501,15 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype, fpi_t fpi_flags) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -3285,31 +3287,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_unref_page_commit(struct page *page, unsigned long pfn) +static void free_unref_page_commit(struct page *page, unsigned long pfn, + int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - int migratetype; - migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); - - /* - * We only track unmovable, reclaimable and movable on pcp lists. - * Free ISOLATE pages back to the allocator because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free - * excessively into the page allocator - */ - if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype, - FPI_NONE); - return; - } - migratetype = MIGRATE_MOVABLE; - } - pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; @@ -3324,12 +3308,29 @@ void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); + int migratetype; if (!free_unref_page_prepare(page, pfn)) return; + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Place ISOLATE pages on the isolated list because they are being + * offlined but treat HIGHATOMIC as movable pages so we can get those + * areas back if necessary.
Otherwise, we may have to free + * excessively into the page allocator + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + return; + } + migratetype = MIGRATE_MOVABLE; + } + local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3341,22 +3342,44 @@ void free_unref_page_list(struct list_head *list) struct page *page, *next; unsigned long flags, pfn; int batch_count = 0; + int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn)) list_del(&page->lru); + + /* + * Free isolated pages directly to the allocator, see + * comment in free_unref_page. + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, + migratetype, FPI_NONE); + continue; + } + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. + */ + set_pcppage_migratetype(page, MIGRATE_MOVABLE); + } + set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_private(page); - + pfn = page_private(page); set_page_private(page, 0); + migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); /* * Guard against excessive IRQ disabled times when we get -- cgit From 902499937e3a82156dcb5069b6df27640480e204 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:03 -0700 Subject: mm/page_alloc: update PGFREE outside the zone lock in __free_pages_ok VM events do not need explicit protection by disabling IRQs so update the counter with IRQs enabled in __free_pages_ok. 
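This is safe because VM events are folded per-cpu counters where a rare lost update only perturbs a statistic; the update boils down to the following (from include/linux/vmstat.h, simplified):

    static inline void __count_vm_events(enum vm_event_item item, long delta)
    {
            raw_cpu_add(vm_event_states.event[item], delta);
    }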
Link: https://lkml.kernel.org/r/20210512095458.30632-10-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd367e5df8cb..37ce0c2f3bae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1600,13 +1600,14 @@ static void __free_pages_ok(struct page *page, unsigned int order, migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); - __count_vm_events(PGFREE, 1 << order); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); } void __free_pages_core(struct page *page, unsigned int order) -- cgit From 151e084af4946344fe0d021f4110b69edaac1e8d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 28 Jun 2021 19:42:06 -0700 Subject: mm: page_alloc: dump migrate-failed pages only at -EBUSY alloc_contig_dump_pages() aims to help debug page migration failures caused by an elevated page refcount compared to expected_count. (for the details, please look at migrate_page_move_mapping) However, -ENOMEM just means the system is under memory pressure and is not related to the page refcount at all. Thus, dumping the page list is not helpful from a debugging point of view. Link: https://lkml.kernel.org/r/YKa2Wyo9xqIErpfa@google.com Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Suren Baghdasaryan Cc: John Dias Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ce0c2f3bae..941a75b9fb5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8800,7 +8800,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, lru_cache_enable(); if (ret < 0) { - alloc_contig_dump_pages(&cc->migratepages); + if (ret == -EBUSY) + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } -- cgit From bbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:09 -0700 Subject: mm/page_alloc: delete vm.percpu_pagelist_fraction Patch series "Calculate pcp->high based on zone sizes and active CPUs", v2. The per-cpu page allocator (PCP) is meant to reduce contention on the zone lock, but the sizing of batch and high is archaic and takes into account neither the zone size nor the number of CPUs local to a zone. With larger zones and more CPUs per node, the contention is getting worse. Furthermore, the fact that vm.percpu_pagelist_fraction adjusts both batch and high values means that the sysctl can reduce zone lock contention but also increase allocation latencies. This series disassociates pcp->high from pcp->batch and then scales pcp->high based on the size of the local zone with limited impact to reclaim and accounting for active CPUs but leaves pcp->batch static. It also adapts the number of pages that can be on the pcp list based on recent freeing patterns.
The motivation is partially to adjust to larger memory sizes but is also driven by the fact that large batches of page freeing via release_pages() often show zone contention as a major part of the problem. Another is a bug report based on an older kernel where a multi-terabyte process can take several minutes to exit. A workaround was to use vm.percpu_pagelist_fraction to increase the pcp->high value but testing indicated that a production workload could not use the same values because of an increase in allocation latencies. Unfortunately, I cannot reproduce this test case myself as the multi-terabyte machines are in active use but this series should alleviate the problem. The series aims to address both and partially acts as a pre-requisite. pcp only works with order-0 pages, which is useless for SLUB (when using high orders) and THP (unconditionally). To store high-order pages on PCP, the pcp->high values need to be increased first. This patch (of 6): The vm.percpu_pagelist_fraction sysctl is used to increase the batch and high limits for the per-cpu page allocator (PCP). The intent behind the sysctl is to reduce zone lock acquisition when allocating/freeing pages but it has a problem. While it can decrease contention, it can also increase latency on the allocation side due to unreasonably large batch sizes. This leads to games where an administrator adjusts percpu_pagelist_fraction on the fly to work around contention and allocation latency problems. This series aims to alleviate the problems with zone lock contention while avoiding the allocation-side latency problems. For the purposes of review, it's easier to remove this sysctl now and reintroduce a similar sysctl later in the series that deals only with pcp->high. Link: https://lkml.kernel.org/r/20210525080119.5455-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 55 ++++--------------------------------------------------- 1 file changed, 4 insertions(+), 51 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941a75b9fb5a..5abf2c1d4c58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,7 +120,6 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); -#define MIN_PERCPU_PAGELIST_FRACTION (8) struct pagesets { local_lock_t lock; @@ -193,7 +192,6 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6735,22 +6733,15 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h /* * Calculate and set new high and batch values for all per-cpu pagesets of a - * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. + * zone based on the zone's size.
*/ static void zone_set_pageset_high_and_batch(struct zone *zone) { unsigned long new_high, new_batch; - if (percpu_pagelist_fraction) { - new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; - new_batch = max(1UL, new_high / 4); - if ((new_high / 4) > (PAGE_SHIFT * 8)) - new_batch = PAGE_SHIFT * 8; - } else { - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); - } + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8413,44 +8404,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -/* - * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu - * pagelist can have before it gets flushed back to buddy allocator. - */ -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct zone *zone; - int old_percpu_pagelist_fraction; - int ret; - - mutex_lock(&pcp_batch_high_lock); - old_percpu_pagelist_fraction = percpu_pagelist_fraction; - - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || ret < 0) - goto out; - - /* Sanity checking to avoid pcp imbalance */ - if (percpu_pagelist_fraction && - percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { - percpu_pagelist_fraction = old_percpu_pagelist_fraction; - ret = -EINVAL; - goto out; - } - - /* No change? */ - if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) - goto out; - - for_each_populated_zone(zone) - zone_set_pageset_high_and_batch(zone); -out: - mutex_unlock(&pcp_batch_high_lock); - return ret; -} - #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit From b92ca18e8ca596f4f3d80c1fe833bc57a1b2458c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:12 -0700 Subject: mm/page_alloc: disassociate the pcp->high from pcp->batch The pcp high watermark is based on the batch size but there is no relationship between them other than that it is convenient to use early in boot. This patch takes the first step and bases pcp->high on the zone low watermark split across the number of CPUs local to a zone while the batch size remains the same to avoid increasing allocation latencies. The intent behind the default pcp->high is "set the number of PCP pages such that if they are all full that background reclaim is not started prematurely". Note that in this patch the pcp->high values are adjusted after memory hotplug events, min_free_kbytes adjustments and watermark scale factor adjustments but not CPU hotplug events, which are handled later in the series.
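As a rough model of the new default (plain C; the watermark and CPU count are invented, chosen so the arithmetic reproduces the KVM example below):

    #include <stdio.h>

    int main(void)
    {
        unsigned long low_wmark_pages = 5192; /* hypothetical zone low watermark */
        unsigned int nr_local_cpus = 8;       /* hypothetical CPUs local to the zone */
        int batch = 63;                       /* batch is left untouched */

        int high = low_wmark_pages / nr_local_cpus;
        if (high < (batch << 2))              /* keep the historical high >= 4*batch */
            high = batch << 2;

        printf("high: %d\nbatch: %d\n", high, batch); /* high: 649, batch: 63 */
        return 0;
    }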
On a test KVM instance; Before grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 378 batch: 63 After grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 [mgorman@techsingularity.net: fix __setup_per_zone_wmarks for parallel memory hotplug] Link: https://lkml.kernel.org/r/20210528105925.GN30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 62 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5abf2c1d4c58..19ec81d403a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2174,14 +2174,6 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); - /* - * The number of managed pages has changed due to the initialisation - * so the pcpu batch and high limits needs to be updated or the limits - * will be artificially small. - */ - for_each_populated_zone(zone) - zone_pcp_update(zone); - /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. @@ -6633,13 +6625,12 @@ static int zone_batchsize(struct zone *zone) int batch; /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. + * The number of pages to batch allocate is either ~0.1% + * of the zone or 1MB, whichever is smaller. The batch + * size is striking a balance between allocation latency + * and zone lock contention. */ - batch = zone_managed_pages(zone) / 1024; - /* But no more than a meg. */ - if (batch * PAGE_SIZE > 1024 * 1024) - batch = (1024 * 1024) / PAGE_SIZE; + batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6676,6 +6667,34 @@ static int zone_batchsize(struct zone *zone) #endif } +static int zone_highsize(struct zone *zone, int batch) +{ +#ifdef CONFIG_MMU + int high; + int nr_local_cpus; + + /* + * The high value of the pcp is based on the zone low watermark + * so that if they are full then background reclaim will not be + * started prematurely. The value is split across all online CPUs + * local to the zone. Note that early in boot that CPUs may not be + * online yet. + */ + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + high = low_wmark_pages(zone) / nr_local_cpus; + + /* + * Ensure high is at least batch*4. The multiple is based on the + * historical relationship between high and batch. + */ + high = max(high, batch << 2); + + return high; +#else + return 0; +#endif +} + /* * pcp->high and pcp->batch values are related and generally batch is lower * than high. 
They are also related to pcp->count such that count is lower @@ -6737,11 +6756,10 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h */ static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long new_high, new_batch; + int new_high, new_batch; - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); + new_batch = max(1, zone_batchsize(zone)); + new_high = zone_highsize(zone, new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8222,11 +8240,19 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { + struct zone *zone; static DEFINE_SPINLOCK(lock); spin_lock(&lock); __setup_per_zone_wmarks(); spin_unlock(&lock); + + /* + * The watermark size have changed so update the pcpu batch + * and high limits or the limits may be inappropriate. + */ + for_each_zone(zone) + zone_pcp_update(zone); } /* -- cgit From 04f8cfeaed0849e702278378bce3867577ca45fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:15 -0700 Subject: mm/page_alloc: adjust pcp->high after CPU hotplug events The PCP high watermark is based on the number of online CPUs so the watermarks must be adjusted during CPU hotplug. At the time of hot-remove, the number of online CPUs is already adjusted but during hot-add, a delta needs to be applied to update PCP to the correct value. After this patch is applied, the high watermarks are adjusted correctly. # grep high: /proc/zoneinfo | tail -1 high: 649 # echo 0 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 664 # echo 1 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 649 Link: https://lkml.kernel.org/r/20210525080119.5455-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19ec81d403a0..8d196a803820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6667,7 +6667,7 @@ static int zone_batchsize(struct zone *zone) #endif } -static int zone_highsize(struct zone *zone, int batch) +static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; @@ -6678,9 +6678,10 @@ static int zone_highsize(struct zone *zone, int batch) * so that if they are full then background reclaim will not be * started prematurely. The value is split across all online CPUs * local to the zone. Note that early in boot that CPUs may not be - * online yet. + * online yet and that during CPU hotplug that the cpumask is not + * yet updated when a CPU is being onlined. */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; high = low_wmark_pages(zone) / nr_local_cpus; /* @@ -6754,12 +6755,12 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h * Calculate and set new high and batch values for all per-cpu pagesets of a * zone based on the zone's size. 
*/ -static void zone_set_pageset_high_and_batch(struct zone *zone) +static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { int new_high, new_batch; new_batch = max(1, zone_batchsize(zone)); - new_high = zone_highsize(zone, new_batch); + new_high = zone_highsize(zone, new_batch, cpu_online); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -6789,7 +6790,7 @@ void __meminit setup_zone_pageset(struct zone *zone) per_cpu_pages_init(pcp, pzstats); } - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, 0); } /* @@ -8044,6 +8045,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) static int page_alloc_cpu_dead(unsigned int cpu) { + struct zone *zone; lru_add_drain_cpu(cpu); drain_pages(cpu); @@ -8064,6 +8066,19 @@ * race with what we are doing. */ cpu_vm_stats_fold(cpu); + + for_each_populated_zone(zone) + zone_pcp_update(zone, 0); + + return 0; +} + +static int page_alloc_cpu_online(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zone_pcp_update(zone, 1); return 0; } @@ -8089,8 +8104,9 @@ void __init page_alloc_init(void) hashdist = 0; #endif - ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, - "mm/page_alloc:dead", NULL, + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, + "mm/page_alloc:pcp", + page_alloc_cpu_online, page_alloc_cpu_dead); WARN_ON(ret < 0); } @@ -8252,7 +8268,7 @@ void setup_per_zone_wmarks(void) * and high limits or the limits may be inappropriate. */ for_each_zone(zone) - zone_pcp_update(zone); + zone_pcp_update(zone, 0); } /* @@ -9053,10 +9069,10 @@ EXPORT_SYMBOL(free_contig_range); * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. */ -void __meminit zone_pcp_update(struct zone *zone) +void zone_pcp_update(struct zone *zone, int cpu_online) { mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, cpu_online); mutex_unlock(&pcp_batch_high_lock); } -- cgit From 3b12e7e97938424de2bb1b95ba0bd6a49bad39f9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:18 -0700 Subject: mm/page_alloc: scale the number of pages that are batch freed When a task is freeing a large number of order-0 pages, it may acquire the zone->lock multiple times freeing pages in batches. This may unnecessarily contend on the zone lock when freeing a very large number of pages. This patch adapts the size of the batch based on the recent pattern to scale the batch size for subsequent frees. As the machines I used to test this were not large enough to illustrate the problem, a debugging patch shows patterns like the following (slightly edited for clarity) Baseline vanilla kernel time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 With patches time-unmap-7724 [...] free_pcppages_bulk: free 126 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 252 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 504 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 751 count 814 high 814 time-unmap-7724 [...]
free_pcppages_bulk: free 751 count 814 high 814 Link: https://lkml.kernel.org/r/20210525080119.5455-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d196a803820..e1d1825a2611 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3278,18 +3278,47 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +{ + int min_nr_free, max_nr_free; + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* Leave at least pcp->batch pages on the list */ + min_nr_free = batch; + max_nr_free = high - batch; + + /* + * Double the number of pages freed each time there is subsequent + * freeing of pages without any allocation. + */ + batch <<= pcp->free_factor; + if (batch < max_nr_free) + pcp->free_factor++; + batch = clamp(batch, min_nr_free, max_nr_free); + + return batch; +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; + int high; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= READ_ONCE(pcp->high)) - free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); + high = READ_ONCE(pcp->high); + if (pcp->count >= high) { + int batch = READ_ONCE(pcp->batch); + + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + } } /* @@ -3541,7 +3570,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_lock_irqsave(&pagesets.lock, flags); + + /* + * On allocation, reduce the number of pages that are batch freed. + * See nr_pcp_free() where free_factor is increased for subsequent + * frees. + */ pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp->free_factor >>= 1; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); @@ -6737,6 +6773,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta */ pcp->high = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; + pcp->free_factor = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, -- cgit From c49c2c47dab6b8d45022b3fabf0642a0e62e3109 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:21 -0700 Subject: mm/page_alloc: limit the number of pages on PCP lists when reclaim is active When kswapd is active then direct reclaim is potentially active. In either case, it is possible that a zone would be balanced if pages were not trapped on PCP lists. Instead of draining remote pages, simply limit the size of the PCP lists while kswapd is active. 
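The patch below implements this as nr_pcp_high(). In miniature, as a hedged userspace sketch rather than the kernel code: while reclaim is active in the zone, the list is capped at four batches.

    #include <stdbool.h>
    #include <stdio.h>

    static int nr_pcp_high_model(int high, int batch, bool reclaim_active)
    {
        if (high == 0)
            return 0;               /* pcp disabled */
        if (!reclaim_active)
            return high;            /* normal limit */
        return batch * 4 < high ? batch * 4 : high; /* min(batch << 2, high) */
    }

    int main(void)
    {
        printf("idle: %d pages\n", nr_pcp_high_model(649, 63, false));
        printf("reclaim active: %d pages\n", nr_pcp_high_model(649, 63, true));
        return 0;
    }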
Link: https://lkml.kernel.org/r/20210525080119.5455-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1d1825a2611..adf35ccfd8e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3302,6 +3302,23 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +{ + int high = READ_ONCE(pcp->high); + + if (unlikely(!high)) + return 0; + + if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + return high; + + /* + * If reclaim is active, limit the number of pages that can be + * stored on pcp lists + */ + return min(READ_ONCE(pcp->batch) << 2, high); +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { @@ -3313,7 +3330,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - high = READ_ONCE(pcp->high); + high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); -- cgit From 74f44822097c665041010994502b5971d6cd9f04 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:24 -0700 Subject: mm/page_alloc: introduce vm.percpu_pagelist_high_fraction This introduces a new sysctl vm.percpu_pagelist_high_fraction. It is similar to the old vm.percpu_pagelist_fraction. The old sysctl increased both pcp->batch and pcp->high with the higher pcp->high potentially reducing zone->lock contention. However, the higher pcp->batch value also potentially increased allocation latency while the PCP was refilled. This sysctl only adjusts pcp->high so that zone->lock contention is potentially reduced but allocation latency during a PCP refill remains the same. 
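As a rough sketch of the resulting arithmetic (the zone size, watermark and CPU count below are invented, chosen so the result matches the transcript that follows):

    #include <stdio.h>

    int main(void)
    {
        unsigned long managed_pages = 2244544; /* hypothetical zone size in pages */
        unsigned long low_wmark_pages = 5192;  /* hypothetical low watermark */
        unsigned int nr_local_cpus = 8;        /* hypothetical local CPU count */
        int fraction = 8;                      /* vm.percpu_pagelist_high_fraction */

        /* base is a fraction of the zone if set, else the low watermark */
        unsigned long total = fraction ? managed_pages / fraction : low_wmark_pages;

        printf("pcp->high = %lu\n", total / nr_local_cpus); /* 35071 with fraction=8 */
        return 0;
    }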
# grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=8 # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 35071 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=64 high: 4383 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=0 high: 649 batch: 63 [mgorman@techsingularity.net: fix documentation] Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 7 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index adf35ccfd8e5..cfc4071310fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,6 +120,7 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) struct pagesets { local_lock_t lock; @@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; +int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) #ifdef CONFIG_MMU int high; int nr_local_cpus; + unsigned long total_pages; + + if (!percpu_pagelist_high_fraction) { + /* + * By default, the high value of the pcp is based on the zone + * low watermark so that if they are full then background + * reclaim will not be started prematurely. + */ + total_pages = low_wmark_pages(zone); + } else { + /* + * If percpu_pagelist_high_fraction is configured, the high + * value is based on a fraction of the managed pages in the + * zone. + */ + total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + } /* - * The high value of the pcp is based on the zone low watermark - * so that if they are full then background reclaim will not be - * started prematurely. The value is split across all online CPUs - * local to the zone. Note that early in boot that CPUs may not be - * online yet and that during CPU hotplug that the cpumask is not - * yet updated when a CPU is being onlined. + * Split the high value across all online CPUs local to the zone. Note + * that early in boot that CPUs may not be online yet and that during + * CPU hotplug that the cpumask is not yet updated when a CPU is being + * onlined. */ nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = low_wmark_pages(zone) / nr_local_cpus; + high = total_pages / nr_local_cpus; /* * Ensure high is at least batch*4. The multiple is based on the @@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. 
+ */ +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int old_percpu_pagelist_high_fraction; + int ret; + + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_high_fraction && + percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { + percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) + goto out; + + for_each_populated_zone(zone) + zone_set_pageset_high_and_batch(zone, 0); +out: + mutex_unlock(&pcp_batch_high_lock); + return ret; +} + #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit From e47aa90568de326625b19d7bc872f8d70b0820b0 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Mon, 28 Jun 2021 19:42:30 -0700 Subject: mm/page_alloc: improve memmap_pages dbg msg Make debug message more accurate. Link: https://lkml.kernel.org/r/20210531091908.1738465-6-aisheng.dong@nxp.com Signed-off-by: Dong Aisheng Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc4071310fb..2a306c34fda7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7383,7 +7383,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) pr_debug(" %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } -- cgit From f7ec104458e00d27a190348ac3a513f3df3699a4 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 28 Jun 2021 19:42:33 -0700 Subject: mm/page_alloc: fix counting of managed_pages commit f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") clears out zone->lowmem_reserve[] if zone is empty. But when zone is not empty and sysctl_lowmem_reserve_ratio[i] is set to zero, zone_managed_pages(zone) is not counted in the managed_pages either. This is inconsistent with the description of lowmem_reserve, so fix it. 
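The fix in miniature: walking down from the highest zone, each zone's reserve is the managed pages of all zones above it divided by its ratio, and a zone whose ratio is zero must still contribute its own pages to the zones below. A plain C sketch with invented zone sizes and ratios:

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical zones, lowest first: DMA, DMA32, Normal */
        unsigned long pages[] = { 4000, 1000, 16000 };
        int ratio[] = { 256, 0, 32 };  /* ratio 0: reserve disabled for zone 1 */
        unsigned long above = 0;       /* managed pages of the zones above idx */
        int idx;

        for (idx = 2; idx >= 0; idx--) {
            unsigned long reserve = ratio[idx] ? above / ratio[idx] : 0;
            printf("zone %d: lowmem_reserve = %lu\n", idx, reserve);
            above += pages[idx];  /* after the fix: accumulate unconditionally */
        }
        /* before the fix, zone 1's 1000 pages were skipped, shrinking zone 0's
         * reserve from 17000/256 = 66 to 16000/256 = 62 */
        return 0;
    }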
Link: https://lkml.kernel.org/r/20210527125707.3760259-1-liushixin2@huawei.com Fixes: f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") Signed-off-by: Liu Shixin Reported-by: yangerkun Reviewed-by: Baoquan He Acked-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a306c34fda7..fc151f6a7dbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8240,14 +8240,14 @@ static void setup_per_zone_lowmem_reserve(void) unsigned long managed_pages = 0; for (j = i + 1; j < MAX_NR_ZONES; j++) { - if (clear) { - zone->lowmem_reserve[j] = 0; - } else { - struct zone *upper_zone = &pgdat->node_zones[j]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); - managed_pages += zone_managed_pages(upper_zone); + if (clear) + zone->lowmem_reserve[j] = 0; + else zone->lowmem_reserve[j] = managed_pages / ratio; - } } } } -- cgit From 21d02f8f8464e27434f477c73431075197a9f72f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:36 -0700 Subject: mm/page_alloc: move free_the_page Patch series "Allow high order pages to be stored on PCP", v2. The per-cpu page allocator (PCP) only handles order-0 pages. With the series "Use local_lock for pcp protection and reduce stat overhead" and "Calculate pcp->high based on zone sizes and active CPUs", it's now feasible to store high-order pages on PCP lists. This small series allows PCP to store "cheap" orders where cheap is determined by PAGE_ALLOC_COSTLY_ORDER and THP-sized allocations. This patch (of 2): In the next patch, free_compound_page() is going to use the common helper free_the_page(). This patch moves the definition to ease review. No functional change. Link: https://lkml.kernel.org/r/20210603142220.10851-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210603142220.10851-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc151f6a7dbd..58f7a321598f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,6 +687,14 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (order == 0) /* Via pcp? */ + free_unref_page(page); + else + __free_pages_ok(page, order, FPI_NONE); +} + /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -5349,14 +5357,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (order == 0) /* Via pcp? */ - free_unref_page(page); - else - __free_pages_ok(page, order, FPI_NONE); -} - /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). -- cgit From bb1c50d3967f69f413b333713c2718d48d1ab7ea Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:52 -0700 Subject: mm: remove CONFIG_DISCONTIGMEM There are no architectures that support DISCONTIGMEM left.
Remove the configuration option and the dead code it was guarding in the generic memory management code. Link: https://lkml.kernel.org/r/20210608091316.3622-6-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58f7a321598f..8926f3fd3bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -349,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -#ifdef CONFIG_DISCONTIGMEM -/* - * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges - * are not on separate NUMA nodes. Functionally this works but with - * watermark_boost_factor, it can reclaim prematurely as the ranges can be - * quite small. By default, do not boost watermarks on discontigmem as in - * many cases very high-order allocations like THP are likely to be - * unsupported and the premature reclaim offsets the advantage of long-term - * fragmentation avoidance. - */ -int watermark_boost_factor __read_mostly; -#else int watermark_boost_factor __read_mostly = 15000; -#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; -- cgit From a9ee6cf5c60ed1070e786e53665f9b2f23f2bd11 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:01 -0700 Subject: mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA After removal of DISCONTIGMEM the NEED_MULTIPLE_NODES and NUMA configuration options are equivalent. Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead. Done with $ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \ $(git grep -wl CONFIG_NEED_MULTIPLE_NODES) $ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \ $(git grep -wl NEED_MULTIPLE_NODES) with manual tweaks afterwards. [rppt@linux.ibm.com: fix arm boot crash] Link: https://lkml.kernel.org/r/YMj9vHhHOiCVN4BF@linux.ibm.com Link: https://lkml.kernel.org/r/20210608091316.3622-9-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8926f3fd3bcf..c4069f9e3968 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1634,7 +1634,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * During memory init memblocks map pfns to nids.
The search is expensive and @@ -1684,7 +1684,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -7438,7 +7438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA /* * With no DISCONTIG, the global mem_map is just set as node 0's */ -- cgit From 43b02ba93b25b1caff7a3457fc5d005485e78da5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:05 -0700 Subject: mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM After removal of the DISCONTIGMEM memory model the FLAT_NODE_MEM_MAP configuration option is equivalent to FLATMEM. Drop CONFIG_FLAT_NODE_MEM_MAP and use CONFIG_FLATMEM instead. Link: https://lkml.kernel.org/r/20210608091316.3622-10-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4069f9e3968..0e441f1677f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6547,7 +6547,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -7403,7 +7403,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } } -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -7451,7 +7451,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) } #else static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) -- cgit From 44042b4498728f4376e84bae1ac8016d146d850b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:08 -0700 Subject: mm/page_alloc: allow high-order pages to be stored on the per-cpu lists The per-cpu page allocator (PCP) only stores order-0 pages. This means that all THP and "cheap" high-order allocations including SLUB contends on the zone->lock. This patch extends the PCP allocator to store THP and "cheap" high-order pages. Note that struct per_cpu_pages increases in size to 256 bytes (4 cache lines) on x86-64. Note that this is not necessarily a universal performance win because of how it is implemented. High-order pages can cause pcp->high to be exceeded prematurely for lower-orders so for example, a large number of THP pages being freed could release order-0 pages from the PCP lists. Hence, much depends on the allocation/free pattern as observed by a single CPU to determine if caching helps or hurts a particular workload. That said, basic performance testing passed. 
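As background for the mechanism, a compact userspace rendering of the per-order list indexing this patch introduces (see order_to_pindex() in the diff below); the constants are typical x86-64 values and are assumptions here:

    #include <stdio.h>

    #define MIGRATE_PCPTYPES        3  /* unmovable, movable, reclaimable */
    #define PAGE_ALLOC_COSTLY_ORDER 3
    #define PAGEBLOCK_ORDER         9  /* THP order on x86-64 */

    /* orders above PAGE_ALLOC_COSTLY_ORDER share one slot, used for THP */
    static int order_to_pindex(int migratetype, int order)
    {
        int base = order > PAGE_ALLOC_COSTLY_ORDER ?
                   PAGE_ALLOC_COSTLY_ORDER + 1 : order;

        return MIGRATE_PCPTYPES * base + migratetype;
    }

    int main(void)
    {
        printf("order 0 movable -> pindex %d\n", order_to_pindex(1, 0));  /* 1 */
        printf("order 3 movable -> pindex %d\n", order_to_pindex(1, 3));  /* 10 */
        printf("order 9 movable -> pindex %d\n", order_to_pindex(1, PAGEBLOCK_ORDER)); /* 13 */
        return 0;
    }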
The following is a netperf UDP_STREAM test which hits the relevant patches as some of the network allocations are high-order. netperf-udp 5.13.0-rc2 5.13.0-rc2 mm-pcpburst-v3r4 mm-pcphighorder-v1r7 Hmean send-64 261.46 ( 0.00%) 266.30 * 1.85%* Hmean send-128 516.35 ( 0.00%) 536.78 * 3.96%* Hmean send-256 1014.13 ( 0.00%) 1034.63 * 2.02%* Hmean send-1024 3907.65 ( 0.00%) 4046.11 * 3.54%* Hmean send-2048 7492.93 ( 0.00%) 7754.85 * 3.50%* Hmean send-3312 11410.04 ( 0.00%) 11772.32 * 3.18%* Hmean send-4096 13521.95 ( 0.00%) 13912.34 * 2.89%* Hmean send-8192 21660.50 ( 0.00%) 22730.72 * 4.94%* Hmean send-16384 31902.32 ( 0.00%) 32637.50 * 2.30%* Functionally, a patch like this is necessary to make bulk allocation of high-order pages work with similar performance to order-0 bulk allocations. The bulk allocator is not updated in this series as it would have to be determined by bulk allocation users how they want to track the order of pages allocated with the bulk allocator. Link: https://lkml.kernel.org/r/20210611135753.GC30378@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Dave Hansen Cc: Michal Hocko Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 169 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 123 insertions(+), 46 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e441f1677f3..34f097ecfe08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,10 +674,53 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + base = PAGE_ALLOC_COSTLY_ORDER + 1; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + order = pageblock_order; + VM_BUG_ON(order != pageblock_order); + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + static inline void free_the_page(struct page *page, unsigned int order) { - if (order == 0) /* Via pcp? */ - free_unref_page(page); + if (pcp_allowed_order(order)) /* Via pcp? */ + free_unref_page(page, order); else __free_pages_ok(page, order, FPI_NONE); } @@ -700,7 +743,7 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page), FPI_NONE); + free_the_page(page, compound_order(page)); } void prep_compound_page(struct page *page, unsigned int order) @@ -1350,9 +1393,9 @@ static __always_inline bool free_pages_prepare(struct page *page, * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when * moved from pcp lists to free lists. 
*/ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1369,12 +1412,12 @@ static bool bulkfree_pcp_prepare(struct page *page) * debug_pagealloc enabled, they are checked also immediately when being freed * to the pcp lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); else - return free_pages_prepare(page, 0, false, FPI_NONE); + return free_pages_prepare(page, order, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1406,8 +1449,10 @@ static inline void prefetch_buddy(struct page *page) static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { - int migratetype = 0; + int pindex = 0; int batch_free = 0; + int nr_freed = 0; + unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; @@ -1418,7 +1463,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * below while (list_empty(list)) loop. */ count = min(pcp->count, count); - while (count) { + while (count > 0) { struct list_head *list; /* @@ -1430,24 +1475,31 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ do { batch_free++; - if (++migratetype == MIGRATE_PCPTYPES) - migratetype = 0; - list = &pcp->lists[migratetype]; + if (++pindex == NR_PCP_LISTS) + pindex = 0; + list = &pcp->lists[pindex]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. 
*/ - if (batch_free == MIGRATE_PCPTYPES) + if (batch_free == NR_PCP_LISTS) batch_free = count; + order = pindex_to_order(pindex); + BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { page = list_last_entry(list, struct page, lru); /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - pcp->count--; + nr_freed += 1 << order; + count -= 1 << order; if (bulkfree_pcp_prepare(page)) continue; + /* Encode order with the migratetype */ + page->index <<= NR_PCP_ORDER_WIDTH; + page->index |= order; + list_add_tail(&page->lru, &head); /* @@ -1463,8 +1515,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page); prefetch_nr--; } - } while (--count && --batch_free && !list_empty(list)); + } while (count > 0 && --batch_free && !list_empty(list)); } + pcp->count -= nr_freed; /* * local_lock_irq held so equivalent to spin_lock_irqsave for @@ -1479,14 +1532,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ list_for_each_entry_safe(page, tmp, &head, lru) { int mt = get_pcppage_migratetype(page); + + /* mt has been encoded with the order (see above) */ + order = mt & NR_PCP_ORDER_MASK; + mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); } spin_unlock(&zone->lock); } @@ -3263,11 +3321,12 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_unref_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) { int migratetype; - if (!free_pcp_prepare(page)) + if (!free_pcp_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); @@ -3317,16 +3376,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) } static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype) + int migratetype, unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int high; + int pindex; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); - list_add(&page->lru, &pcp->lists[migratetype]); - pcp->count++; + pindex = order_to_pindex(migratetype, order); + list_add(&page->lru, &pcp->lists[pindex]); + pcp->count += 1 << order; high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); @@ -3336,15 +3397,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, } /* - * Free a 0-order page + * Free a pcp page */ -void free_unref_page(struct page *page) +void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, order)) return; /* @@ -3357,14 +3418,14 @@ void free_unref_page(struct page *page) migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype); +
free_unref_page_commit(page, pfn, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3381,7 +3442,7 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, 0)) list_del(&page->lru); /* @@ -3413,7 +3474,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, 0); migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype); + free_unref_page_commit(page, pfn, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get @@ -3549,7 +3610,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, /* Remove page from the per-cpu list, caller must protect the list */ static inline -struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3558,16 +3620,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - READ_ONCE(pcp->batch), list, + int batch = READ_ONCE(pcp->batch); + int alloced; + + /* + * Scale batch relative to order if batch implies + * free pages can be stored on the PCP. Batch can + * be 1 for small zones or for boot pagesets which + * should never store free pages as the pages may + * belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + alloced = rmqueue_bulk(zone, order, + batch, list, migratetype, alloc_flags); + + pcp->count += alloced << order; if (unlikely(list_empty(list))) return NULL; } page = list_first_entry(list, struct page, lru); list_del(&page->lru); - pcp->count--; + pcp->count -= 1 << order; } while (check_new_pcp(page)); return page; @@ -3575,8 +3651,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, gfp_t gfp_flags, - int migratetype, unsigned int alloc_flags) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3592,8 +3669,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, */ pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; - list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3614,15 +3691,15 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0)) { + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and * we need to skip it when CMA area isn't allowed. 
*/ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, - migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype, alloc_flags); goto out; } } @@ -5201,7 +5278,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[ac.migratetype]; + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { @@ -5211,7 +5288,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, continue; } - page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { /* Try and get at least one page */ @@ -6778,13 +6855,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - int migratetype; + int pindex; memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); - for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) - INIT_LIST_HEAD(&pcp->lists[migratetype]); + for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) + INIT_LIST_HEAD(&pcp->lists[pindex]); /* * Set batch and high values safe for a boot pageset. A true percpu -- cgit From 203c06eef579c670b8eb3a24108b9837bf9b7737 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:11 -0700 Subject: mm/page_alloc: split pcp->high across all online CPUs for cpuless nodes Dave Hansen reported the following about Feng Tang's tests on a machine with persistent memory onlined as a DRAM-like device. Feng Tang tossed these on a "Cascade Lake" system with 96 threads and ~512G of persistent memory and 128G of DRAM. The PMEM is in "volatile use" mode and being managed via the buddy just like the normal RAM. The PMEM zones are big ones: present 65011712 = 248 G high 134595 = 525 M The PMEM nodes, of course, don't have any CPUs in them. With your series, the pcp->high value per-cpu is 69584 pages or about 270MB per CPU. Scaled up by the 96 CPU threads, that's ~26GB of worst-case memory in the pcps per zone, or roughly 10% of the size of the zone. This should not cause a problem as such although it could trigger reclaim due to pages being stored on per-cpu lists for CPUs remote to a node. It is not possible to treat cpuless nodes exactly the same as normal nodes but the worst-case scenario can be mitigated by splitting pcp->high across all online CPUs for cpuless memory nodes. 
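The mitigation as arithmetic (plain C; the page budget is invented, sized to match the report above, where 69584 pages per CPU across 96 threads is roughly 26GB): the old code divided by max(1, local CPUs), which for a cpuless node left the entire budget on every remote CPU.

    #include <stdio.h>

    int main(void)
    {
        unsigned long total_pages = 69584;   /* hypothetical per-zone budget */
        unsigned int local_cpus = 0;         /* cpuless PMEM node */
        unsigned int online_cpus = 96;

        unsigned long old_high = total_pages / (local_cpus ? local_cpus : 1);
        unsigned int nr_split_cpus = local_cpus ? local_cpus : online_cpus;
        unsigned long new_high = total_pages / nr_split_cpus;

        printf("old pcp->high: %lu pages per CPU\n", old_high); /* 69584 */
        printf("new pcp->high: %lu pages per CPU\n", new_high); /* 724 */
        return 0;
    }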
Link: https://lkml.kernel.org/r/20210616110743.GK30378@techsingularity.net Suggested-by: Dave Hansen Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Cc: "Tang, Feng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34f097ecfe08..db00ee8d79d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6790,7 +6790,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; - int nr_local_cpus; + int nr_split_cpus; unsigned long total_pages; if (!percpu_pagelist_high_fraction) { @@ -6813,10 +6813,14 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * Split the high value across all online CPUs local to the zone. Note * that early in boot that CPUs may not be online yet and that during * CPU hotplug that the cpumask is not yet updated when a CPU is being - * onlined. - */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = total_pages / nr_local_cpus; + * onlined. For memory nodes that have no CPUs, split pcp->high across + * all online CPUs to mitigate the risk that reclaim is triggered + * prematurely due to pages stored on pcp lists. + */ + nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; + if (!nr_split_cpus) + nr_split_cpus = num_online_cpus(); + high = total_pages / nr_split_cpus; /* * Ensure high is at least batch*4. The multiple is based on the -- cgit