From c24d37322548a6ec3caec67100d28b9c1f89f60a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 28 Jun 2021 19:33:23 -0700 Subject: mm/gup: fix try_grab_compound_head() race with split_huge_page() try_grab_compound_head() is used to grab a reference to a page from get_user_pages_fast(), which is only protected against concurrent freeing of page tables (via local_irq_save()), but not against concurrent TLB flushes, freeing of data pages, or splitting of compound pages. Because no reference is held to the page when try_grab_compound_head() is called, the page may have been freed and reallocated by the time its refcount has been elevated; therefore, once we're holding a stable reference to the page, the caller re-checks whether the PTE still points to the same page (with the same access rights). The problem is that try_grab_compound_head() has to grab a reference on the head page; but between the time we look up what the head page is and the time we actually grab a reference on the head page, the compound page may have been split up (either explicitly through split_huge_page() or by freeing the compound page to the buddy allocator and then allocating its individual order-0 pages). If that happens, get_user_pages_fast() may end up returning the right page but lifting the refcount on a now-unrelated page, leading to use-after-free of pages. To fix it: Re-check whether the pages still belong together after lifting the refcount on the head page. Move anything else that checks compound_head(page) below the refcount increment. This can't actually happen on bare-metal x86 (because there, disabling IRQs locks out remote TLB flushes), but it can happen on virtualized x86 (e.g. under KVM) and probably also on arm64. The race window is pretty narrow, and constantly allocating and shattering hugepages isn't exactly fast; for now I've only managed to reproduce this in an x86 KVM guest with an artificially widened timing window (by adding a loop that repeatedly calls `inl(0x3f8 + 5)` in `try_get_compound_head()` to force VM exits, so that PV TLB flushes are used instead of IPIs). As requested on the list, also replace the existing VM_BUG_ON_PAGE() with a warning and bailout. Since the existing code only performed the BUG_ON check on DEBUG_VM kernels, ensure that the new code also only performs the check under that configuration - I don't want to mix two logically separate changes together too much. The macro VM_WARN_ON_ONCE_PAGE() doesn't return a value on !DEBUG_VM, so wrap the whole check in an #ifdef block. An alternative would be to change the VM_WARN_ON_ONCE_PAGE() definition for !DEBUG_VM such that it always returns false, but since that would differ from the behavior of the normal WARN macros, it might be too confusing for readers. Link: https://lkml.kernel.org/r/20210615012014.1100672-1-jannh@google.com Fixes: 7aef4172c795 ("mm: handle PTE-mapped tail pages in gerneric fast gup implementaiton") Signed-off-by: Jann Horn Reviewed-by: John Hubbard Cc: Matthew Wilcox Cc: Kirill A. Shutemov Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 58 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3ded6a5f26b2..90262e448552 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -44,6 +44,23 @@ static void hpage_pincount_sub(struct page *page, int refs) atomic_sub(refs, compound_pincount_ptr(page)); } +/* Equivalent to calling put_page() @refs times. 
*/ +static void put_page_refs(struct page *page, int refs) +{ +#ifdef CONFIG_DEBUG_VM + if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page)) + return; +#endif + + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + /* * Return the compound head page with ref appropriately incremented, * or NULL if that failed. @@ -56,6 +73,21 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) return NULL; if (unlikely(!page_cache_add_speculative(head, refs))) return NULL; + + /* + * At this point we have a stable reference to the head page; but it + * could be that between the compound_head() lookup and the refcount + * increment, the compound page was split, in which case we'd end up + * holding a reference on a page that has nothing to do with the page + * we were given anymore. + * So now that the head page is stable, recheck that the pages still + * belong together. + */ + if (unlikely(compound_head(page) != head)) { + put_page_refs(head, refs); + return NULL; + } + return head; } @@ -95,6 +127,14 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, !is_pinnable_page(page))) return NULL; + /* + * CAUTION: Don't use compound_head() on the page before this + * point, the result won't be stable. + */ + page = try_get_compound_head(page, refs); + if (!page) + return NULL; + /* * When pinning a compound page of order > 1 (which is what * hpage_pincount_available() checks for), use an exact count to @@ -103,15 +143,10 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, * However, be sure to *also* increment the normal page refcount * field at least once, so that the page really is pinned. */ - if (!hpage_pincount_available(page)) - refs *= GUP_PIN_COUNTING_BIAS; - - page = try_get_compound_head(page, refs); - if (!page) - return NULL; - if (hpage_pincount_available(page)) hpage_pincount_add(page, refs); + else + page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1)); mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, orig_refs); @@ -135,14 +170,7 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags) refs *= GUP_PIN_COUNTING_BIAS; } - VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). - */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); + put_page_refs(page, refs); } /** -- cgit From 122e093c1734361dedb64f65c99b93e28e4624f4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:33:26 -0700 Subject: mm/page_alloc: fix memory map initialization for descending nodes On systems with memory nodes sorted in descending order, for instance Dell Precision WorkStation T5500, the struct pages for higher PFNs and respectively lower nodes, could be overwritten by the initialization of struct pages corresponding to the holes in the memory sections. 
For example for the below memory layout [ 0.245624] Early memory node ranges [ 0.248496] node 1: [mem 0x0000000000001000-0x0000000000090fff] [ 0.251376] node 1: [mem 0x0000000000100000-0x00000000dbdf8fff] [ 0.254256] node 1: [mem 0x0000000100000000-0x0000001423ffffff] [ 0.257144] node 0: [mem 0x0000001424000000-0x0000002023ffffff] the range 0x1424000000 - 0x1428000000 in the beginning of node 0 starts in the middle of a section and will be considered as a hole during the initialization of the last section in node 1. The wrong initialization of the memory map causes panic on boot when CONFIG_DEBUG_VM is enabled. Reorder loop order of the memory map initialization so that the outer loop will always iterate over populated memory regions in the ascending order and the inner loop will select the zone corresponding to the PFN range. This way initialization of the struct pages for the memory holes will be always done for the ranges that are actually not populated. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/YNXlMqBbL+tBG7yq@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=213073 Link: https://lkml.kernel.org/r/20210624062305.10940-1-rppt@kernel.org Fixes: 0740a50b9baa ("mm/page_alloc.c: refactor initialization of struct page for holes in memory layout") Signed-off-by: Mike Rapoport Cc: Boris Petkov Cc: Robert Shteynfeld Cc: Baoquan He Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page_alloc.c | 94 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ae31622deef..9afb8998e7e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2474,7 +2474,6 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); -extern void memmap_init_zone(struct zone *zone); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef2265f86b91..5b5c9f5813b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6400,7 +6400,7 @@ void __ref memmap_init_zone_device(struct zone *zone, return; /* - * The call to memmap_init_zone should have already taken care + * The call to memmap_init should have already taken care * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ @@ -6465,7 +6465,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during - * memmap_init_zone(). + * memmap_init_zone_range(). * * But, there could be struct pages that correspond to holes in * memblock.memory. This can happen because of the following reasons: @@ -6484,9 +6484,9 @@ static void __meminit zone_init_free_lists(struct zone *zone) * zone/node above the hole except for the trailing pages in the last * section that will be appended to the zone/node below. 
*/ -static u64 __meminit init_unavailable_range(unsigned long spfn, - unsigned long epfn, - int zone, int node) +static void __init init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { unsigned long pfn; u64 pgcnt = 0; @@ -6502,56 +6502,77 @@ static u64 __meminit init_unavailable_range(unsigned long spfn, pgcnt++; } - return pgcnt; + if (pgcnt) + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + node, zone_names[zone], pgcnt); } #else -static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn, - int zone, int node) +static inline void init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) { - return 0; } #endif -void __meminit __weak memmap_init_zone(struct zone *zone) +static void __init memmap_init_zone_range(struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *hole_pfn) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; - int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); - static unsigned long hole_pfn; + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + + if (start_pfn >= end_pfn) + return; + + memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (*hole_pfn < start_pfn) + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + + *hole_pfn = end_pfn; +} + +static void __init memmap_init(void) +{ unsigned long start_pfn, end_pfn; - u64 pgcnt = 0; + unsigned long hole_pfn = 0; + int i, j, zone_id, nid; - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); - end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + struct pglist_data *node = NODE_DATA(nid); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = node->node_zones + j; - if (end_pfn > start_pfn) - memmap_init_range(end_pfn - start_pfn, nid, - zone_id, start_pfn, zone_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + if (!populated_zone(zone)) + continue; - if (hole_pfn < start_pfn) - pgcnt += init_unavailable_range(hole_pfn, start_pfn, - zone_id, nid); - hole_pfn = end_pfn; + memmap_init_zone_range(zone, start_pfn, end_pfn, + &hole_pfn); + zone_id = j; + } } #ifdef CONFIG_SPARSEMEM /* - * Initialize the hole in the range [zone_end_pfn, section_end]. - * If zone boundary falls in the middle of a section, this hole - * will be re-initialized during the call to this function for the - * higher zone. + * Initialize the memory map for hole in the range [memory_end, + * section_end]. + * Append the pages in this hole to the highest zone in the last + * node. 
+ * The call to init_unavailable_range() is outside the ifdef to + * silence the compiler warining about zone_id set but not used; + * for FLATMEM it is a nop anyway */ - end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION); + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); if (hole_pfn < end_pfn) - pgcnt += init_unavailable_range(hole_pfn, end_pfn, - zone_id, nid); #endif - - if (pgcnt) - pr_info(" %s zone: %llu pages in unavailable ranges\n", - zone->name, pgcnt); + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } static int zone_batchsize(struct zone *zone) @@ -7254,7 +7275,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(zone); init_currently_empty_zone(zone, zone->zone_start_pfn, size); - memmap_init_zone(zone); } } @@ -7780,6 +7800,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + + memmap_init(); } static int __init cmdline_parse_core(char *p, unsigned long *core, -- cgit From ff4b2b4014cbffb3d32b22629252f4dc8616b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:33:29 -0700 Subject: mm/page_alloc: correct return value of populated elements if bulk array is populated Dave Jones reported the following This made it into 5.13 final, and completely breaks NFSD for me (Serving tcp v3 mounts). Existing mounts on clients hang, as do new mounts from new clients. Rebooting the server back to rc7 everything recovers. The commit b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") returns the wrong value if the array is already populated which is interpreted as an allocation failure. Dave reported this fixes his problem and it also passed a test running dbench over NFS. Link: https://lkml.kernel.org/r/20210628150219.GC3840@techsingularity.net Fixes: b3b64ebd3822 ("mm/page_alloc: do bulk array bounds check after checking populated elements") Signed-off-by: Mel Gorman Reported-by: Dave Jones Tested-by: Dave Jones Cc: Dan Carpenter Cc: Jesper Dangaard Brouer Cc: Vlastimil Babka Cc: [5.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b5c9f5813b9..2bf03c76504b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5058,7 +5058,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Already populated array? */ if (unlikely(page_array && nr_pages - nr_populated == 0)) - return 0; + return nr_populated; /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) -- cgit From 20ce0c2d5a303c41c0e02ceb596837868e290dcc Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Mon, 28 Jun 2021 19:33:32 -0700 Subject: kthread: switch to new kerneldoc syntax for named variable macro argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The syntax without dots is available since commit 43756e347f21 ("scripts/kernel-doc: Add support for named variable macro arguments"). The same HTML output is produced with and without this patch. 
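For reference, the two accepted spellings for documenting a variadic macro parameter look like this; only the second, dot-less form is kept by this patch (both lines appear in the diff below):

    * @arg...: arguments for @namefmt.    (older dotted spelling)
    * @arg:    arguments for @namefmt.    (named variable macro argument spelling)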
Link: https://lkml.kernel.org/r/20210513161702.1721039-1-j.neuschaefer@gmx.net Signed-off-by: Jonathan Neuschäfer Cc: Jens Axboe Cc: Felix Kuehling Cc: Valentin Schneider Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2484ed97e72f..db3eafea168f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -18,7 +18,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * @threadfn: the function to run in the thread * @data: data pointer for @threadfn() * @namefmt: printf-style format string for the thread name - * @arg...: arguments for @namefmt. + * @arg: arguments for @namefmt. * * This macro will create a kthread on the current node, leaving it in * the stopped state. This is just a helper for kthread_create_on_node(); -- cgit From d71ba1649fa3c464c51ec7163e4b817345bff2c7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 28 Jun 2021 19:33:35 -0700 Subject: kthread_worker: fix return value when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync() kthread_mod_delayed_work() might race with kthread_cancel_delayed_work_sync() or another kthread_mod_delayed_work() call. The function lets the other operation win when it sees work->canceling counter set. And it returns @false. But it should return @true as it is done by the related workqueue API, see mod_delayed_work_on(). The reason is that the return value might be used for reference counting. It has to distinguish the case when the number of queued works has changed or stayed the same. The change is safe. kthread_mod_delayed_work() return value is not checked anywhere at the moment. Link: https://lore.kernel.org/r/20210521163526.GA17916@redhat.com Link: https://lkml.kernel.org/r/20210610133051.15337-4-pmladek@suse.com Signed-off-by: Petr Mladek Reported-by: Oleg Nesterov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Cc: Minchan Kim Cc: Cc: Martin Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index 0fccf7d0c6a1..86ae5f2e6db8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1156,14 +1156,14 @@ static bool __kthread_cancel_work(struct kthread_work *work) * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * - * Return: %true if @dwork was pending and its timer was modified, - * %false otherwise. + * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command - * win and return %false here. The caller is supposed to synchronize these - * operations a reasonable way. + * win and return %true here. The return value can be used for reference + * counting and the number of queued works stays the same. Anyway, the caller + * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. 
* See __kthread_cancel_work() and kthread_delayed_work_timer_fn() @@ -1175,13 +1175,15 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, { struct kthread_work *work = &dwork->work; unsigned long flags; - int ret = false; + int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ - if (!work->worker) + if (!work->worker) { + ret = false; goto fast_queue; + } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); @@ -1199,8 +1201,11 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); - if (work->canceling) + if (work->canceling) { + /* The number of works in the queue does not change. */ + ret = true; goto out; + } ret = __kthread_cancel_work(work); fast_queue: -- cgit From f589c67ff08c82405f3e69603ac159ea76933a50 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Jun 2021 19:33:38 -0700 Subject: ia64: headers: drop duplicated words Delete the repeated words "to" and "the". Link: https://lkml.kernel.org/r/20210507184837.10754-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Fenghua Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pal.h | 2 +- arch/ia64/include/asm/spinlock.h | 2 +- arch/ia64/include/asm/uv/uv_hub.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h index b1d87955e8cc..5c51fceedaf9 100644 --- a/arch/ia64/include/asm/pal.h +++ b/arch/ia64/include/asm/pal.h @@ -1086,7 +1086,7 @@ static inline long ia64_pal_freq_base(unsigned long *platform_base_freq) /* * Get the ratios for processor frequency, bus frequency and interval timer to - * to base frequency of the platform + * the base frequency of the platform */ static inline s64 ia64_pal_freq_ratios (struct pal_freq_ratio *proc_ratio, struct pal_freq_ratio *bus_ratio, diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 5f620e66384e..864775970c50 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -26,7 +26,7 @@ * the queue, and the other indicating the current tail. The lock is acquired * by atomically noting the tail and incrementing it by one (thus adding * ourself to the queue and noting our position), then waiting until the head - * becomes equal to the the initial value of the tail. + * becomes equal to the initial value of the tail. * The pad bits in the middle are used to prevent the next_ticket number * overflowing into the now_serving number. 
* diff --git a/arch/ia64/include/asm/uv/uv_hub.h b/arch/ia64/include/asm/uv/uv_hub.h index 2a88c7204e52..809ddb6896db 100644 --- a/arch/ia64/include/asm/uv/uv_hub.h +++ b/arch/ia64/include/asm/uv/uv_hub.h @@ -257,7 +257,7 @@ static inline int uv_numa_blade_id(void) return 0; } -/* Convert a cpu number to the the UV blade number */ +/* Convert a cpu number to the UV blade number */ static inline int uv_cpu_to_blade_id(int cpu) { return 0; -- cgit From c5f320ff8a79501bb59338278336ec43acb9d7e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Jun 2021 19:33:41 -0700 Subject: ia64: mca_drv: fix incorrect array size calculation gcc points out a mistake in the mca driver that goes back to before the git history: arch/ia64/kernel/mca_drv.c: In function 'init_record_index_pools': arch/ia64/kernel/mca_drv.c:346:54: error: expression does not compute the number of elements in this array; element typ e is 'int', not 'size_t' {aka 'long unsigned int'} [-Werror=sizeof-array-div] 346 | for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++) | ^ This is the same as sizeof(size_t), which is two shorter than the actual array. Use the ARRAY_SIZE() macro to get the correct calculation instead. Link: https://lkml.kernel.org/r/20210514214123.875971-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Masahiro Yamada Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/mca_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 36a69b4e6169..5bfc79be4cef 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -343,7 +343,7 @@ init_record_index_pools(void) /* - 2 - */ sect_min_size = sal_log_sect_min_sizes[0]; - for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++) + for (i = 1; i < ARRAY_SIZE(sal_log_sect_min_sizes); i++) if (sect_min_size > sal_log_sect_min_sizes[i]) sect_min_size = sal_log_sect_min_sizes[i]; -- cgit From b83c8ba40cebcee1d07cb852c23d616acf8988b7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 28 Jun 2021 19:33:44 -0700 Subject: streamline_config.pl: make spacing consistent Patch series "streamline_config.pl: Fix Perl spacing". Talking with John Hawley about how vim and emacs deal with Perl files with respect to tabs and spaces, I found that some of my Perl code in the kernel had inconsistent spacing. The way emacs handles Perl by default is to use 4 spaces per indent, but make all 8 spaces into a single tab. Vim does not do this by default. But if you add the vim variable control: # vim: softtabstop=4 to a perl file, it makes vim behave the same way as emacs. The first patch is to change all 8 spaces into a single tab (mostly from people editing the file with vim). The next patch adds the softtabstop variable to make vim act like emacs by default. This patch (of 2): As Perl code tends to have 4 space indentation, but uses tabs for every 8 spaces, make that consistent in the streamline_config.pl code. Replace all 8 spaces with a single tab. 
Link: https://lkml.kernel.org/r/20210322214032.133596267@goodmis.org Signed-off-by: Steven Rostedt (VMware) Cc: "John (Warthog9) Hawley" Cc: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kconfig/streamline_config.pl | 78 ++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl index 911c72a2dbc4..2e65aa9edf97 100755 --- a/scripts/kconfig/streamline_config.pl +++ b/scripts/kconfig/streamline_config.pl @@ -601,12 +601,12 @@ if (defined($ENV{'LMC_KEEP'})) { sub in_preserved_kconfigs { my $kconfig = $config2kfile{$_[0]}; if (!defined($kconfig)) { - return 0; + return 0; } foreach my $excl (@preserved_kconfigs) { - if($kconfig =~ /^$excl/) { - return 1; - } + if($kconfig =~ /^$excl/) { + return 1; + } } return 0; } @@ -629,52 +629,52 @@ foreach my $line (@config_file) { } if (/CONFIG_MODULE_SIG_KEY="(.+)"/) { - my $orig_cert = $1; - my $default_cert = "certs/signing_key.pem"; - - # Check that the logic in this script still matches the one in Kconfig - if (!defined($depends{"MODULE_SIG_KEY"}) || - $depends{"MODULE_SIG_KEY"} !~ /"\Q$default_cert\E"/) { - print STDERR "WARNING: MODULE_SIG_KEY assertion failure, ", - "update needed to ", __FILE__, " line ", __LINE__, "\n"; - print; - } elsif ($orig_cert ne $default_cert && ! -f $orig_cert) { - print STDERR "Module signature verification enabled but ", - "module signing key \"$orig_cert\" not found. Resetting ", - "signing key to default value.\n"; - print "CONFIG_MODULE_SIG_KEY=\"$default_cert\"\n"; - } else { - print; - } - next; + my $orig_cert = $1; + my $default_cert = "certs/signing_key.pem"; + + # Check that the logic in this script still matches the one in Kconfig + if (!defined($depends{"MODULE_SIG_KEY"}) || + $depends{"MODULE_SIG_KEY"} !~ /"\Q$default_cert\E"/) { + print STDERR "WARNING: MODULE_SIG_KEY assertion failure, ", + "update needed to ", __FILE__, " line ", __LINE__, "\n"; + print; + } elsif ($orig_cert ne $default_cert && ! -f $orig_cert) { + print STDERR "Module signature verification enabled but ", + "module signing key \"$orig_cert\" not found. Resetting ", + "signing key to default value.\n"; + print "CONFIG_MODULE_SIG_KEY=\"$default_cert\"\n"; + } else { + print; + } + next; } if (/CONFIG_SYSTEM_TRUSTED_KEYS="(.+)"/) { - my $orig_keys = $1; - - if (! -f $orig_keys) { - print STDERR "System keyring enabled but keys \"$orig_keys\" ", - "not found. Resetting keys to default value.\n"; - print "CONFIG_SYSTEM_TRUSTED_KEYS=\"\"\n"; - } else { - print; - } - next; + my $orig_keys = $1; + + if (! -f $orig_keys) { + print STDERR "System keyring enabled but keys \"$orig_keys\" ", + "not found. 
Resetting keys to default value.\n"; + print "CONFIG_SYSTEM_TRUSTED_KEYS=\"\"\n"; + } else { + print; + } + next; } if (/^(CONFIG.*)=(m|y)/) { - if (in_preserved_kconfigs($1)) { - dprint "Preserve config $1"; - print; - next; - } + if (in_preserved_kconfigs($1)) { + dprint "Preserve config $1"; + print; + next; + } if (defined($configs{$1})) { if ($localyesconfig) { - $setconfigs{$1} = 'y'; + $setconfigs{$1} = 'y'; print "$1=y\n"; next; } else { - $setconfigs{$1} = $2; + $setconfigs{$1} = $2; } } elsif ($2 eq "m") { print "# $1 is not set\n"; -- cgit From d1b1f1e627c0085fb2e2b5690929a3d53879cc67 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 28 Jun 2021 19:33:47 -0700 Subject: streamline_config.pl: add softtabstop=4 for vim users The tab stop for Perl files is by default (at least in emacs) to be 4 spaces, where a tab is used for all 8 spaces. Add a local variable comment to make vim do the same by default, and this will help keep the file consistent in the future when others edit it via vim and not emacs. Link: https://lkml.kernel.org/r/20210322214032.293992979@goodmis.org Signed-off-by: Steven Rostedt (VMware) Cc: Masahiro Yamada Cc: "John (Warthog9) Hawley" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kconfig/streamline_config.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/kconfig/streamline_config.pl b/scripts/kconfig/streamline_config.pl index 2e65aa9edf97..1a5fea0519eb 100755 --- a/scripts/kconfig/streamline_config.pl +++ b/scripts/kconfig/streamline_config.pl @@ -702,3 +702,5 @@ foreach my $module (keys(%modules)) { print STDERR "\n"; } } + +# vim: softtabstop=4 -- cgit From c1c9142004e7e21d6d3d2cd6a339845771ce6a27 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 28 Jun 2021 19:33:50 -0700 Subject: scripts/spelling.txt: add more spellings to spelling.txt Here are some of the more common spelling mistakes and typos that I've found while fixing up spelling mistakes in the kernel in the past few months. 
Link: https://lkml.kernel.org/r/20210514093655.8829-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 7b6a01291598..17fdc620d548 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -22,6 +22,7 @@ absolut||absolute absoulte||absolute acccess||access acceess||access +accelaration||acceleration acceleratoin||acceleration accelleration||acceleration accesing||accessing @@ -264,6 +265,7 @@ calucate||calculate calulate||calculate cancelation||cancellation cancle||cancel +canot||cannot capabilites||capabilities capabilties||capabilities capabilty||capability @@ -494,7 +496,10 @@ digial||digital dimention||dimension dimesions||dimensions diconnected||disconnected +disabed||disabled +disble||disable disgest||digest +disired||desired dispalying||displaying diplay||display directon||direction @@ -710,6 +715,7 @@ havind||having heirarchically||hierarchically heirarchy||hierarchy helpfull||helpful +hearbeat||heartbeat heterogenous||heterogeneous hexdecimal||hexadecimal hybernate||hibernate @@ -989,6 +995,7 @@ notications||notifications notifcations||notifications notifed||notified notity||notify +nubmer||number numebr||number numner||number obtaion||obtain @@ -1014,8 +1021,10 @@ ommiting||omitting ommitted||omitted onself||oneself ony||only +openning||opening operatione||operation opertaions||operations +opportunies||opportunities optionnal||optional optmizations||optimizations orientatied||orientated @@ -1111,6 +1120,7 @@ prefitler||prefilter preform||perform premption||preemption prepaired||prepared +prepate||prepare preperation||preparation preprare||prepare pressre||pressure @@ -1123,6 +1133,7 @@ privilaged||privileged privilage||privilege priviledge||privilege priviledges||privileges +privleges||privileges probaly||probably procceed||proceed proccesors||processors @@ -1167,6 +1178,7 @@ promixity||proximity psudo||pseudo psuedo||pseudo psychadelic||psychedelic +purgable||purgeable pwoer||power queing||queuing quering||querying @@ -1180,6 +1192,7 @@ receieve||receive recepient||recipient recevied||received receving||receiving +recievd||received recieved||received recieve||receive reciever||receiver @@ -1228,6 +1241,7 @@ reponse||response representaion||representation reqeust||request reqister||register +requed||requeued requestied||requested requiere||require requirment||requirement @@ -1332,6 +1346,7 @@ singal||signal singed||signed sleeped||slept sliped||slipped +softwade||software softwares||software soley||solely souce||source @@ -1510,6 +1525,7 @@ unintialized||uninitialized unitialized||uninitialized unkmown||unknown unknonw||unknown +unknouwn||unknown unknow||unknown unkown||unknown unamed||unnamed -- cgit From d98e4d95411bbde2220a7afa38dcc9c14d71acbe Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Mon, 28 Jun 2021 19:33:52 -0700 Subject: ntfs: fix validity check for file name attribute When checking the file name attribute, we want to ensure that it fits within the bounds of ATTR_RECORD. To do this, we should check that (attr record + file name offset + file name length) < (attr record + attr record length). However, the original check did not include the file name offset in the calculation. This means that corrupted on-disk metadata might not caught by the incorrect file name check, and lead to an invalid memory access. 
An example can be seen in the crash report of a memory corruption error found by Syzbot: https://syzkaller.appspot.com/bug?id=a1a1e379b225812688566745c3e2f7242bffc246 Adding the file name offset to the validity check fixes this error and passes the Syzbot reproducer test. Link: https://lkml.kernel.org/r/20210614050540.289494-1-desmondcheongzx@gmail.com Signed-off-by: Desmond Cheong Zhi Xi Reported-by: syzbot+213ac8bb98f7f4420840@syzkaller.appspotmail.com Tested-by: syzbot+213ac8bb98f7f4420840@syzkaller.appspotmail.com Acked-by: Anton Altaparmakov Cc: Shuah Khan Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f5c058b3192c..4474adb393ca 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -477,7 +477,7 @@ err_corrupt_attr: } file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + le16_to_cpu(attr->data.resident.value_offset)); - p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length); + p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); if (p2 < (u8*)attr || p2 > p) goto err_corrupt_attr; /* This attribute is ok, but is it in the $Extend directory? */ -- cgit From 10dde05b89980ef147f590d2735d7dd53aa39c88 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 28 Jun 2021 19:33:55 -0700 Subject: squashfs: add option to panic on errors Add an errors=panic mount option to make squashfs trigger a panic when errors are encountered, similar to several other filesystems. This allows a kernel dump to be saved using which the corruption can be analysed and debugged. Inspired by a pre-fs_context patch by Anton Eliasson. Link: https://lkml.kernel.org/r/20210527125019.14511-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/squashfs/block.c | 5 ++- fs/squashfs/squashfs_fs_sb.h | 1 + fs/squashfs/super.c | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b9e87ebb1060..855f0e87066d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -226,8 +226,11 @@ out_free_bio: bio_free_pages(bio); bio_put(bio); out: - if (res < 0) + if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); + if (msblk->panic_on_errors) + panic("squashfs read failed"); + } return res; } diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 166e98806265..1e90c2575f9b 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -65,5 +65,6 @@ struct squashfs_sb_info { unsigned int fragments; int xattr_ids; unsigned int ids; + bool panic_on_errors; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 88cc94be1076..60d6951915f4 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -18,9 +18,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -37,6 +39,51 @@ static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; +enum Opt_errors { + Opt_errors_continue, + Opt_errors_panic, +}; + +enum squashfs_param { + Opt_errors, +}; + +struct squashfs_mount_opts { + enum Opt_errors errors; +}; + +static const struct constant_table squashfs_param_errors[] = { + {"continue", Opt_errors_continue }, + {"panic", Opt_errors_panic }, + {} +}; + +static const struct 
fs_parameter_spec squashfs_fs_parameters[] = { + fsparam_enum("errors", Opt_errors, squashfs_param_errors), + {} +}; + +static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct squashfs_mount_opts *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, squashfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_errors: + opts->errors = result.uint_32; + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct squashfs_decompressor *supported_squashfs_filesystem( struct fs_context *fc, short major, short minor, short id) @@ -67,6 +114,7 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct squashfs_mount_opts *opts = fc->fs_private; struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; struct inode *root; @@ -85,6 +133,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) } msblk = sb->s_fs_info; + msblk->panic_on_errors = (opts->errors == Opt_errors_panic); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); @@ -350,18 +400,52 @@ static int squashfs_get_tree(struct fs_context *fc) static int squashfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct squashfs_mount_opts *opts = fc->fs_private; + sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; + + msblk->panic_on_errors = (opts->errors == Opt_errors_panic); + return 0; } +static void squashfs_free_fs_context(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + static const struct fs_context_operations squashfs_context_ops = { .get_tree = squashfs_get_tree, + .free = squashfs_free_fs_context, + .parse_param = squashfs_parse_param, .reconfigure = squashfs_reconfigure, }; +static int squashfs_show_options(struct seq_file *s, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + + if (msblk->panic_on_errors) + seq_puts(s, ",errors=panic"); + else + seq_puts(s, ",errors=continue"); + + return 0; +} + static int squashfs_init_fs_context(struct fs_context *fc) { + struct squashfs_mount_opts *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; } @@ -481,6 +565,7 @@ static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", .init_fs_context = squashfs_init_fs_context, + .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; @@ -491,6 +576,7 @@ static const struct super_operations squashfs_super_ops = { .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, + .show_options = squashfs_show_options, }; module_init(init_squashfs_fs); -- cgit From 74ef829e41be8ada93e1d1dfa681c11be338c8d5 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 28 Jun 2021 19:33:58 -0700 Subject: ocfs2: remove unnecessary INIT_LIST_HEAD() The list_head o2hb_node_events is initialized statically. It is unnecessary to initialize by INIT_LIST_HEAD(). 
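For context, a sketch of why the call is redundant (not part of the patch): o2hb_node_events is defined with the LIST_HEAD() macro, which already produces a fully initialized list head at build time, so the later INIT_LIST_HEAD() only repeats that work.

    static LIST_HEAD(o2hb_node_events);
    /* roughly expands to:
     *   static struct list_head o2hb_node_events =
     *           { &o2hb_node_events, &o2hb_node_events };
     * so the INIT_LIST_HEAD(&o2hb_node_events) removed below is a no-op in practice.
     */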
Link: https://lkml.kernel.org/r/20210511115847.3817395-1-yangyingliang@huawei.com Signed-off-by: Yang Yingliang Reported-by: Hulk Robot Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index e829c2595543..1169c8dc9106 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1442,8 +1442,6 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - INIT_LIST_HEAD(&o2hb_node_events); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); -- cgit From 54e948c60cc843b6e84dc44496edc91f51d2a28e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 28 Jun 2021 19:34:01 -0700 Subject: ocfs2: fix snprintf() checking The snprintf() function returns the number of bytes which would have been printed if the buffer was large enough. In other words it can return ">= remain" but this code assumes it returns "== remain". The run time impact of this bug is not very severe. The next iteration through the loop would trigger a WARN() when we pass a negative limit to snprintf(). We would then return success instead of -E2BIG. The kernel implementation of snprintf() will never return negatives so there is no need to check and I have deleted that dead code. Link: https://lkml.kernel.org/r/20210511135350.GV1955@kadam Fixes: a860f6eb4c6a ("ocfs2: sysfile interfaces for online file check") Fixes: 74ae4e104dfc ("ocfs2: Create stack glue sysfs files.") Signed-off-by: Dan Carpenter Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/filecheck.c | 6 +----- fs/ocfs2/stackglue.c | 8 ++------ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 90b8d300c1ee..de56e6231af8 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -326,11 +326,7 @@ static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", p->fe_ino, p->fe_done, ocfs2_filecheck_error(p->fe_status)); - if (ret < 0) { - total = ret; - break; - } - if (ret == remain) { + if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d50e8b8dfea4..16f1bfc407f2 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -500,11 +500,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); - if (ret < 0) { - total = ret; - break; - } - if (ret == remain) { + if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; @@ -531,7 +527,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); - if (ret == PAGE_SIZE) + if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); -- cgit From ca49b6d856ebde1e795e8bee37c461bff9939e02 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 28 Jun 2021 19:34:05 -0700 Subject: ocfs2: remove 
redundant assignment to pointer queue The pointer queue is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Link: https://lkml.kernel.org/r/20210513113957.57539-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4960a6de768d..9b88219febb5 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2977,7 +2977,7 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; - struct list_head *queue = &res->granted; + struct list_head *queue; struct dlm_lock *lock; int noderef; u8 nodenum = O2NM_MAX_NODES; -- cgit From 01f01399136ca290e20cb21839c32a52ce626d16 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 28 Jun 2021 19:34:08 -0700 Subject: ocfs2: remove repeated uptodate check for buffer In commit 60f91826ca62 ("buffer: Avoid setting buffer bits that are already set"), function set_buffer_##name was added a test_bit() to check buffer, which is the same as function buffer_##name. The !buffer_uptodate(bh) here is a repeated check. Remove it. Link: https://lkml.kernel.org/r/20210425025702.13628-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1294925ac94a..e1c6fa5bd0e7 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -632,8 +632,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, } if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && -- cgit From f0f798db0586b34c552997f8c9d923a5db21fe5e Mon Sep 17 00:00:00 2001 From: Chen Huang Date: Mon, 28 Jun 2021 19:34:11 -0700 Subject: ocfs2: replace simple_strtoull() with kstrtoull() simple_strtoull() is deprecated in some situation since it does not check for the range overflow, use kstrtoull() instead. 
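A minimal illustration of the difference (not taken from the patch): simple_strtoull() silently wraps on overflow and leaves end-of-string checking to the caller, while kstrtoull() reports both overflow and malformed input through its return value.

    unsigned long long v;
    char *end;
    int err;

    v = simple_strtoull("99999999999999999999999", &end, 0);  /* wraps, no error reported */

    err = kstrtoull("99999999999999999999999", 0, &v);        /* err == -ERANGE */
    err = kstrtoull("123abc", 0, &v);                          /* err == -EINVAL */
    err = kstrtoull("123", 0, &v);                             /* err == 0, v == 123 */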
Link: https://lkml.kernel.org/r/20210526092020.554341-3-chenhuang5@huawei.com Signed-off-by: Chen Huang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1169c8dc9106..f89ffcbd585f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1596,12 +1596,13 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; + ssize_t ret; if (reg->hr_bdev) return -EINVAL; - tmp = simple_strtoull(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtoull(p, 0, &tmp); + if (ret) return -EINVAL; reg->hr_start_block = tmp; -- cgit From 7ed6d4e418d98e78c9f2b895d76cdaed7a7ccbdb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 28 Jun 2021 19:34:14 -0700 Subject: ocfs2: remove redundant initialization of variable ret The variable ret is being initialized with a value that is never read, the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Link: https://lkml.kernel.org/r/20210613135148.74658-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/nodemanager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb82e6b1ff4e..625c92521416 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -824,7 +824,7 @@ static void __exit exit_o2nm(void) static int __init init_o2nm(void) { - int ret = -1; + int ret; o2hb_init(); -- cgit From b124ac45bda0338f2aa3969e7c135139267f8987 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 28 Jun 2021 19:34:17 -0700 Subject: kernel: watchdog: modify the explanation related to watchdog thread The watchdog thread has been replaced by cpu_stop_work, modify the explanation related. Link: https://lkml.kernel.org/r/1619687073-24686-2-git-send-email-wangqing@vivo.com Signed-off-by: Wang Qing Reviewed-by: Petr Mladek Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Cc: Joe Perches Cc: Stephen Kitt Cc: Kees Cook Cc: Randy Dunlap Cc: "Guilherme G. Piccoli" Cc: Qais Yousef Cc: Santosh Sivaraj Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 92d3bcc5a5e0..ad912511a0c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -92,7 +92,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); * own hardlockup detector. * * watchdog_nmi_enable/disable can be implemented to start and stop when - * softlockup watchdog threads start and stop. The arch must select the + * softlockup watchdog start and stop. The arch must select the * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) @@ -335,7 +335,7 @@ static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); /* - * The watchdog thread function - touches the timestamp. + * The watchdog feed function - touches the timestamp. 
* * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed @@ -558,11 +558,7 @@ static void lockup_detector_reconfigure(void) } /* - * Create the watchdog thread infrastructure and configure the detector(s). - * - * The threads are not unparked as watchdog_allowed_mask is empty. When - * the threads are successfully initialized, take the proper locks and - * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + * Create the watchdog infrastructure and configure the detector(s). */ static __init void lockup_detector_setup(void) { @@ -628,7 +624,7 @@ void lockup_detector_soft_poweroff(void) #ifdef CONFIG_SYSCTL -/* Propagate any changes to the watchdog threads */ +/* Propagate any changes to the watchdog infrastructure */ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ -- cgit From e55fda8cdcba2cb3d5d46ae5fcd5f243f8b70d6e Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 28 Jun 2021 19:34:20 -0700 Subject: doc: watchdog: modify the explanation related to watchdog thread "watchdog/%u" threads has be replaced by cpu_stop_work. The current description is extremely misleading. Link: https://lkml.kernel.org/r/1619687073-24686-4-git-send-email-wangqing@vivo.com Signed-off-by: Wang Qing Reviewed-by: Petr Mladek Cc: "Guilherme G. Piccoli" Cc: Joe Perches Cc: Jonathan Corbet Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Qais Yousef Cc: Randy Dunlap Cc: Santosh Sivaraj Cc: Stephen Kitt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/lockup-watchdogs.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst index 290840c160af..3e09284a8b9b 100644 --- a/Documentation/admin-guide/lockup-watchdogs.rst +++ b/Documentation/admin-guide/lockup-watchdogs.rst @@ -39,7 +39,7 @@ in principle, they should work in any architecture where these subsystems are present. A periodic hrtimer runs to generate interrupts and kick the watchdog -task. An NMI perf event is generated every "watchdog_thresh" +job. An NMI perf event is generated every "watchdog_thresh" (compile-time initialized to 10 and configurable through sysctl of the same name) seconds to check for hardlockups. If any CPU in the system does not receive any hrtimer interrupt during that time the @@ -47,7 +47,7 @@ does not receive any hrtimer interrupt during that time the generate a kernel warning or call panic, depending on the configuration. -The watchdog task is a high priority kernel thread that updates a +The watchdog job runs in a stop scheduling thread that updates a timestamp every time it is scheduled. If that timestamp is not updated for 2*watchdog_thresh seconds (the softlockup threshold) the 'softlockup detector' (coded inside the hrtimer callback function) -- cgit From 256f7a6791e8f19bafa1d702f69a6a6ba16250e3 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Mon, 28 Jun 2021 19:34:24 -0700 Subject: doc: watchdog: modify the doc related to "watchdog/%u" "watchdog/%u" threads has be replaced by cpu_stop_work. The current description is extremely misleading. Link: https://lkml.kernel.org/r/1619687073-24686-5-git-send-email-wangqing@vivo.com Signed-off-by: Wang Qing Reviewed-by: Petr Mladek Cc: "Guilherme G. 
Piccoli" Cc: Joe Perches Cc: Jonathan Corbet Cc: Kees Cook Cc: Mauro Carvalho Chehab Cc: Qais Yousef Cc: Randy Dunlap Cc: Santosh Sivaraj Cc: Stephen Kitt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 68b21395a743..04c79017814b 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1283,11 +1283,11 @@ This parameter can be used to control the soft lockup detector. = ================================= The soft lockup detector monitors CPUs for threads that are hogging the CPUs -without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads -from running. The mechanism depends on the CPUs ability to respond to timer -interrupts which are needed for the 'watchdog/N' threads to be woken up by -the watchdog timer function, otherwise the NMI watchdog — if enabled — can -detect a hard lockup condition. +without rescheduling voluntarily, and thus prevent the 'migration/N' threads +from running, causing the watchdog work fail to execute. The mechanism depends +on the CPUs ability to respond to timer interrupts which are needed for the +watchdog work to be queued by the watchdog timer function, otherwise the NMI +watchdog — if enabled — can detect a hard lockup condition. stack_erasing -- cgit From 4acaa7d5045e21d5469232d0e6e79cdaf6755754 Mon Sep 17 00:00:00 2001 From: gumingtao Date: Mon, 28 Jun 2021 19:34:27 -0700 Subject: slab: use __func__ to trace function name It is better to use __func__ to trace function name. Link: https://lkml.kernel.org/r/31fdbad5c45cd1e26be9ff37be321b8586b80fee.1624355507.git.gumingtao@xiaomi.com Signed-off-by: gumingtao Acked-by: Christoph Lameter Acked-by: David Rientjes Reviewed-by: Aaron Tomlin Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 7cab77655f11..1ded52592b56 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -377,11 +377,11 @@ out_unlock: if (err) { if (flags & SLAB_PANIC) - panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", - name, err); + panic("%s: Failed to create slab '%s'. Error %d\n", + __func__, name, err); else { - pr_warn("kmem_cache_create(%s) failed with error %d\n", - name, err); + pr_warn("%s(%s) failed with error %d\n", + __func__, name, err); dump_stack(); } return NULL; @@ -508,8 +508,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s); if (err) { - pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", - s->name); + pr_err("%s %s: Slab cache still has objects\n", + __func__, s->name); dump_stack(); } out_unlock: -- cgit From 26c6cb7cf830349c6518a7efe1c32ac796cd192e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 28 Jun 2021 19:34:30 -0700 Subject: kunit: make test->lock irq safe The upcoming SLUB kunit test will be calling kunit_find_named_resource() from a context with disabled interrupts. That means kunit's test->lock needs to be IRQ safe to avoid potential deadlocks and lockdep splats. This patch therefore changes the test->lock usage to spin_lock_irqsave() and spin_unlock_irqrestore(). 
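The splat/deadlock pattern being closed off is the usual one; an illustrative single-CPU interleaving (not code from the patch), with the old plain spin_lock():

    spin_lock(&test->lock);                  /* process context, IRQs still enabled */
            /* hard interrupt arrives on this CPU */
            kunit_find_resource(test, match, match_data);
                    spin_lock(&test->lock);  /* spins on a lock this CPU already holds */
                                             /* never succeeds: self-deadlock */

Disabling interrupts for the critical section with spin_lock_irqsave() makes this interleaving impossible and satisfies lockdep once the lock is also taken from IRQ-disabled contexts.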
Link: https://lkml.kernel.org/r/20210511150734.3492-1-glittao@gmail.com Signed-off-by: Vlastimil Babka Signed-off-by: Oliver Glitta Reviewed-by: Brendan Higgins Cc: Christoph Lameter Cc: Daniel Latypov Cc: David Rientjes Cc: Joonsoo Kim Cc: Marco Elver Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/kunit/test.h | 5 +++-- lib/kunit/test.c | 18 +++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/kunit/test.h b/include/kunit/test.h index 49601c4b98b8..524d4789af22 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -515,8 +515,9 @@ kunit_find_resource(struct kunit *test, void *match_data) { struct kunit_resource *res, *found = NULL; + unsigned long flags; - spin_lock(&test->lock); + spin_lock_irqsave(&test->lock, flags); list_for_each_entry_reverse(res, &test->resources, node) { if (match(test, res, (void *)match_data)) { @@ -526,7 +527,7 @@ kunit_find_resource(struct kunit *test, } } - spin_unlock(&test->lock); + spin_unlock_irqrestore(&test->lock, flags); return found; } diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 2f6cc0123232..45f068864d76 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -475,6 +475,7 @@ int kunit_add_resource(struct kunit *test, void *data) { int ret = 0; + unsigned long flags; res->free = free; kref_init(&res->refcount); @@ -487,10 +488,10 @@ int kunit_add_resource(struct kunit *test, res->data = data; } - spin_lock(&test->lock); + spin_lock_irqsave(&test->lock, flags); list_add_tail(&res->node, &test->resources); /* refcount for list is established by kref_init() */ - spin_unlock(&test->lock); + spin_unlock_irqrestore(&test->lock, flags); return ret; } @@ -548,9 +549,11 @@ EXPORT_SYMBOL_GPL(kunit_alloc_and_get_resource); void kunit_remove_resource(struct kunit *test, struct kunit_resource *res) { - spin_lock(&test->lock); + unsigned long flags; + + spin_lock_irqsave(&test->lock, flags); list_del(&res->node); - spin_unlock(&test->lock); + spin_unlock_irqrestore(&test->lock, flags); kunit_put_resource(res); } EXPORT_SYMBOL_GPL(kunit_remove_resource); @@ -630,6 +633,7 @@ EXPORT_SYMBOL_GPL(kunit_kfree); void kunit_cleanup(struct kunit *test) { struct kunit_resource *res; + unsigned long flags; /* * test->resources is a stack - each allocation must be freed in the @@ -641,9 +645,9 @@ void kunit_cleanup(struct kunit *test) * protect against the current node being deleted, not the next. */ while (true) { - spin_lock(&test->lock); + spin_lock_irqsave(&test->lock, flags); if (list_empty(&test->resources)) { - spin_unlock(&test->lock); + spin_unlock_irqrestore(&test->lock, flags); break; } res = list_last_entry(&test->resources, @@ -654,7 +658,7 @@ void kunit_cleanup(struct kunit *test) * resource, and this can't happen if the test->lock * is held. */ - spin_unlock(&test->lock); + spin_unlock_irqrestore(&test->lock, flags); kunit_remove_resource(test, res); } current->kunit_test = NULL; -- cgit From 1f9f78b1b376f82cdd8ed73cc0abdb74d0453d43 Mon Sep 17 00:00:00 2001 From: Oliver Glitta Date: Mon, 28 Jun 2021 19:34:33 -0700 Subject: mm/slub, kunit: add a KUnit test for SLUB debugging functionality SLUB has resiliency_test() function which is hidden behind #ifdef SLUB_RESILIENCY_TEST that is not part of Kconfig, so nobody runs it. KUnit should be a proper replacement for it. Try changing byte in redzone after allocation and changing pointer to next free node, first byte, 50th byte and redzone byte. Check if validation finds errors. 
There are several differences from the original resiliency test: Tests create own caches with known state instead of corrupting shared kmalloc caches. The corruption of freepointer uses correct offset, the original resiliency test got broken with freepointer changes. Scratch changing random byte test, because it does not have meaning in this form where we need deterministic results. Add new option CONFIG_SLUB_KUNIT_TEST in Kconfig. Tests next_pointer, first_word and clobber_50th_byte do not run with KASAN option on. Because the test deliberately modifies non-allocated objects. Use kunit_resource to count errors in cache and silence bug reports. Count error whenever slab_bug() or slab_fix() is called or when the count of pages is wrong. [glittao@gmail.com: remove unused function test_exit(), from SLUB KUnit test] Link: https://lkml.kernel.org/r/20210512140656.12083-1-glittao@gmail.com [akpm@linux-foundation.org: export kasan_enable/disable_current to modules] Link: https://lkml.kernel.org/r/20210511150734.3492-2-glittao@gmail.com Signed-off-by: Oliver Glitta Reviewed-by: Vlastimil Babka Acked-by: Daniel Latypov Acked-by: Marco Elver Cc: Brendan Higgins Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 12 +++++ lib/Makefile | 1 + lib/slub_kunit.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/common.c | 3 ++ mm/slab.h | 1 + mm/slub.c | 46 +++++++++++++++-- 6 files changed, 212 insertions(+), 3 deletions(-) create mode 100644 lib/slub_kunit.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 678c13967580..7723f58a9394 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2429,6 +2429,18 @@ config BITS_TEST If unsure, say N. +config SLUB_KUNIT_TEST + tristate "KUnit test for SLUB cache error detection" if !KUNIT_ALL_TESTS + depends on SLUB_DEBUG && KUNIT + default KUNIT_ALL_TESTS + help + This builds SLUB allocator unit test. + Tests SLUB cache debugging functionality. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. 
+ config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/Makefile b/lib/Makefile index 2cc359ec1fdd..6d5ea8f5b52c 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -354,5 +354,6 @@ obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o obj-$(CONFIG_BITS_TEST) += test_bits.o obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o +obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c new file mode 100644 index 000000000000..8662dc6cb509 --- /dev/null +++ b/lib/slub_kunit.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include "../mm/slab.h" + +static struct kunit_resource resource; +static int slab_errors; + +static void test_clobber_zone(struct kunit *test) +{ + struct kmem_cache *s = kmem_cache_create("TestSlub_RZ_alloc", 64, 0, + SLAB_RED_ZONE, NULL); + u8 *p = kmem_cache_alloc(s, GFP_KERNEL); + + kasan_disable_current(); + p[64] = 0x12; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + kasan_enable_current(); + kmem_cache_free(s, p); + kmem_cache_destroy(s); +} + +#ifndef CONFIG_KASAN +static void test_next_pointer(struct kunit *test) +{ + struct kmem_cache *s = kmem_cache_create("TestSlub_next_ptr_free", 64, 0, + SLAB_POISON, NULL); + u8 *p = kmem_cache_alloc(s, GFP_KERNEL); + unsigned long tmp; + unsigned long *ptr_addr; + + kmem_cache_free(s, p); + + ptr_addr = (unsigned long *)(p + s->offset); + tmp = *ptr_addr; + p[s->offset] = 0x12; + + /* + * Expecting three errors. + * One for the corrupted freechain and the other one for the wrong + * count of objects in use. The third error is fixing broken cache. + */ + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 3, slab_errors); + + /* + * Try to repair corrupted freepointer. + * Still expecting two errors. The first for the wrong count + * of objects in use. + * The second error is for fixing broken cache. + */ + *ptr_addr = tmp; + slab_errors = 0; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + /* + * Previous validation repaired the count of objects in use. + * Now expecting no error. 
+ */ + slab_errors = 0; + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 0, slab_errors); + + kmem_cache_destroy(s); +} + +static void test_first_word(struct kunit *test) +{ + struct kmem_cache *s = kmem_cache_create("TestSlub_1th_word_free", 64, 0, + SLAB_POISON, NULL); + u8 *p = kmem_cache_alloc(s, GFP_KERNEL); + + kmem_cache_free(s, p); + *p = 0x78; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + kmem_cache_destroy(s); +} + +static void test_clobber_50th_byte(struct kunit *test) +{ + struct kmem_cache *s = kmem_cache_create("TestSlub_50th_word_free", 64, 0, + SLAB_POISON, NULL); + u8 *p = kmem_cache_alloc(s, GFP_KERNEL); + + kmem_cache_free(s, p); + p[50] = 0x9a; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + kmem_cache_destroy(s); +} +#endif + +static void test_clobber_redzone_free(struct kunit *test) +{ + struct kmem_cache *s = kmem_cache_create("TestSlub_RZ_free", 64, 0, + SLAB_RED_ZONE, NULL); + u8 *p = kmem_cache_alloc(s, GFP_KERNEL); + + kasan_disable_current(); + kmem_cache_free(s, p); + p[64] = 0xab; + + validate_slab_cache(s); + KUNIT_EXPECT_EQ(test, 2, slab_errors); + + kasan_enable_current(); + kmem_cache_destroy(s); +} + +static int test_init(struct kunit *test) +{ + slab_errors = 0; + + kunit_add_named_resource(test, NULL, NULL, &resource, + "slab_errors", &slab_errors); + return 0; +} + +static struct kunit_case test_cases[] = { + KUNIT_CASE(test_clobber_zone), + +#ifndef CONFIG_KASAN + KUNIT_CASE(test_next_pointer), + KUNIT_CASE(test_first_word), + KUNIT_CASE(test_clobber_50th_byte), +#endif + + KUNIT_CASE(test_clobber_redzone_free), + {} +}; + +static struct kunit_suite test_suite = { + .name = "slub_test", + .init = test_init, + .test_cases = test_cases, +}; +kunit_test_suite(test_suite); + +MODULE_LICENSE("GPL"); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6bb87f2acd4e..2586d3718600 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -51,11 +51,14 @@ void kasan_enable_current(void) { current->kasan_depth++; } +EXPORT_SYMBOL(kasan_enable_current); void kasan_disable_current(void) { current->kasan_depth--; } +EXPORT_SYMBOL(kasan_disable_current); + #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void __kasan_unpoison_range(const void *address, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 18c1927cd196..9b690fa44cae 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -215,6 +215,7 @@ DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); +long validate_slab_cache(struct kmem_cache *s); #else static inline void print_tracking(struct kmem_cache *s, void *object) { diff --git a/mm/slub.c b/mm/slub.c index 61bd40e3eb9a..290444ab932b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -449,6 +450,26 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); +#if IS_ENABLED(CONFIG_KUNIT) +static bool slab_add_kunit_errors(void) +{ + struct kunit_resource *resource; + + if (likely(!current->kunit_test)) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + (*(int *)resource->data)++; + kunit_put_resource(resource); + return true; +} +#else +static inline bool slab_add_kunit_errors(void) { return false; } +#endif + /* * Determine a map of 
object in use on a page. * @@ -679,6 +700,9 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) struct va_format vaf; va_list args; + if (slab_add_kunit_errors()) + return; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -742,6 +766,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { + if (slab_add_kunit_errors()) + return; + slab_bug(s, "%s", reason); print_trailer(s, page, object); } @@ -752,6 +779,9 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, va_list args; char buf[100]; + if (slab_add_kunit_errors()) + return; + va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); @@ -801,12 +831,16 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, while (end > fault && end[-1] == value) end--; + if (slab_add_kunit_errors()) + goto skip_bug_print; + slab_bug(s, "%s overwritten", what); pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); +skip_bug_print: restore_bytes(s, what, value, fault, end); return 0; } @@ -4649,9 +4683,11 @@ static int validate_slab_node(struct kmem_cache *s, validate_slab(s, page); count++; } - if (count != n->nr_partial) + if (count != n->nr_partial) { pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", s->name, count, n->nr_partial); + slab_add_kunit_errors(); + } if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -4660,16 +4696,18 @@ static int validate_slab_node(struct kmem_cache *s, validate_slab(s, page); count++; } - if (count != atomic_long_read(&n->nr_slabs)) + if (count != atomic_long_read(&n->nr_slabs)) { pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", s->name, count, atomic_long_read(&n->nr_slabs)); + slab_add_kunit_errors(); + } out: spin_unlock_irqrestore(&n->list_lock, flags); return count; } -static long validate_slab_cache(struct kmem_cache *s) +long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; @@ -4681,6 +4719,8 @@ static long validate_slab_cache(struct kmem_cache *s) return count; } +EXPORT_SYMBOL(validate_slab_cache); + /* * Generate lists of code addresses where slabcache objects are allocated * and freed. -- cgit From 3d8e374c6d46a648333b9ef87983bc726f8e56bc Mon Sep 17 00:00:00 2001 From: Oliver Glitta Date: Mon, 28 Jun 2021 19:34:36 -0700 Subject: slub: remove resiliency_test() function Function resiliency_test() is hidden behind #ifdef SLUB_RESILIENCY_TEST that is not part of Kconfig, so nobody runs it. This function is replaced with KUnit test for SLUB added by the previous patch "selftests: add a KUnit test for SLUB debugging functionality". 
Link: https://lkml.kernel.org/r/20210511150734.3492-3-glittao@gmail.com Signed-off-by: Oliver Glitta Reviewed-by: Marco Elver Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Oliver Glitta Cc: Brendan Higgins Cc: Daniel Latypov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 64 --------------------------------------------------------------- 1 file changed, 64 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 290444ab932b..13857910055b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -155,9 +155,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * - Variable sizing of the per node arrays */ -/* Enable to test recovery from slab corruption on boot */ -#undef SLUB_RESILIENCY_TEST - /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG @@ -4938,66 +4935,6 @@ static int list_locations(struct kmem_cache *s, char *buf, } #endif /* CONFIG_SLUB_DEBUG */ -#ifdef SLUB_RESILIENCY_TEST -static void __init resiliency_test(void) -{ - u8 *p; - int type = KMALLOC_NORMAL; - - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - - pr_err("SLUB resiliency testing\n"); - pr_err("-----------------------\n"); - pr_err("A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", - p + 16); - - validate_slab_cache(kmalloc_caches[type][4]); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", - p); - pr_err("If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches[type][5]); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - pr_err("If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[type][6]); - - pr_err("\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[type][7]); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[type][8]); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[type][9]); -} -#else -#ifdef CONFIG_SYSFS -static void resiliency_test(void) {}; -#endif -#endif /* SLUB_RESILIENCY_TEST */ - #ifdef CONFIG_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ @@ -5846,7 +5783,6 @@ static int __init slab_sysfs_init(void) } mutex_unlock(&slab_mutex); - resiliency_test(); return 0; } -- cgit From 588c7fa022d7b2361500ead5660d9a1a2ecd9b7d Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Mon, 28 Jun 2021 19:34:39 -0700 Subject: mm, slub: change run-time assertion in kmalloc_index() to compile-time Currently when size is not supported by kmalloc_index, compiler will generate a run-time BUG() while compile-time error is also possible, and better. So change BUG to BUILD_BUG_ON_MSG to make compile-time check possible. Also remove code that allocates more than 32MB because current implementation supports only up to 32MB. 
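For illustration, a hedged sketch of what the change means for callers (not taken from the patch; the exact diagnostic wording depends on the compiler, and the compile-time path only applies when the size is a compile-time constant and the compiler check added below is satisfied):

	#include <linux/slab.h>

	/*
	 * 128MB has never had a kmalloc cache: the old code only detected
	 * this with a run-time BUG(), while the new code rejects it while
	 * building, with something along the lines of:
	 *   error: call to '__compiletime_assert_NNN' declared with
	 *   attribute error: unexpected size in kmalloc_index()
	 */
	static unsigned int example_unsupported_index(void)
	{
		return kmalloc_index(128 * 1024 * 1024);
	}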
[42.hyeyoo@gmail.com: fix support for clang 10] Link: https://lkml.kernel.org/r/20210518181247.GA10062@hyeyoo [vbabka@suse.cz: fix false-positive assert in kernel/bpf/local_storage.c] Link: https://lkml.kernel.org/r/bea97388-01df-8eac-091b-a3c89b4a4a09@suse.czLink: https://lkml.kernel.org/r/20210511173448.GA54466@hyeyoo [elver@google.com: kfence fix] Link: https://lkml.kernel.org/r/20210512195227.245000695c9014242e9a00e5@linux-foundation.org Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Signed-off-by: Marco Elver Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 17 ++++++++++++++--- mm/kfence/kfence_test.c | 5 +++-- mm/slab_common.c | 7 +++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c97d788762c..bc9ab3a5a017 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -346,8 +346,14 @@ static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) * 1 = 65 .. 96 bytes * 2 = 129 .. 192 bytes * n = 2^(n-1)+1 .. 2^n + * + * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized; + * typical usage is via kmalloc_index() and therefore evaluated at compile-time. + * Callers where !size_is_constant should only be test modules, where runtime + * overheads of __kmalloc_index() can be tolerated. Also see kmalloc_slab(). */ -static __always_inline unsigned int kmalloc_index(size_t size) +static __always_inline unsigned int __kmalloc_index(size_t size, + bool size_is_constant) { if (!size) return 0; @@ -382,12 +388,17 @@ static __always_inline unsigned int kmalloc_index(size_t size) if (size <= 8 * 1024 * 1024) return 23; if (size <= 16 * 1024 * 1024) return 24; if (size <= 32 * 1024 * 1024) return 25; - if (size <= 64 * 1024 * 1024) return 26; - BUG(); + + if ((IS_ENABLED(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 110000) + && !IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) + BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); + else + BUG(); /* Will never be reached. Needed because the compiler may complain */ return -1; } +#define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 4acf4251ee04..7f24b9bcb2ec 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -197,7 +197,7 @@ static void test_cache_destroy(void) static inline size_t kmalloc_cache_alignment(size_t size) { - return kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]->align; + return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align; } /* Must always inline to match stack trace against caller. 
*/ @@ -267,7 +267,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat if (is_kfence_address(alloc)) { struct page *page = virt_to_head_page(alloc); - struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)]; + struct kmem_cache *s = test_cache ?: + kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]; /* * Verify that various helpers return the right values diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ded52592b56..b97b6fa8a7c6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -754,8 +754,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is - * kmalloc-67108864. + * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is + * kmalloc-32M. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -783,8 +783,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(4194304, 4M), INIT_KMALLOC_INFO(8388608, 8M), INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M), - INIT_KMALLOC_INFO(67108864, 64M) + INIT_KMALLOC_INFO(33554432, 32M) }; /* -- cgit From 02ac47d0cdd48c0c859a6ac7a6fad49c8e413ce1 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 28 Jun 2021 19:34:43 -0700 Subject: slub: restore slub_debug=- behavior Patch series "slub: Print non-hashed pointers in slub debugging", v3. I was doing some debugging recently and noticed that my pointers were being hashed while slub_debug was on the kernel commandline. Let's force on the no hash pointer option when slub_debug is on the kernel commandline so that the prints are more meaningful. The first two patches are something else I noticed while looking at the code. The message argument is never used so the debugging messages are not as clear as they could be and the slub_debug=- behavior seems to be busted. Then there's a printf fixup from Joe and the final patch is the one that force disables pointer hashing. This patch (of 4): Passing slub_debug=- on the kernel commandline is supposed to disable slub debugging. This is especially useful with CONFIG_SLUB_DEBUG_ON where the default is to have slub debugging enabled in the build. Due to some code reorganization this behavior was dropped, but the code to make it work mostly stuck around. Restore the previous behavior by disabling the static key when we parse the commandline and see that we're trying to disable slub debugging.
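As a hedged illustration of what the restored semantics buy (sketch only, not from the patch): on a CONFIG_SLUB_DEBUG_ON kernel booted with slub_debug=-, the static key now stays disabled, so fast-path checks like the one below compile down to a patched-out branch and the debug hooks are skipped.

	#include <linux/jump_label.h>

	DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);	/* as declared in mm/slab.h */

	static bool example_debug_active(void)
	{
		/* false after "slub_debug=-", even though the key defaults to true */
		return static_branch_unlikely(&slub_debug_enabled);
	}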
Link: https://lkml.kernel.org/r/20210601182202.3011020-1-swboyd@chromium.org Link: https://lkml.kernel.org/r/20210601182202.3011020-2-swboyd@chromium.org Fixes: ca0cab65ea2b ("mm, slub: introduce static key for slub_debug()") Signed-off-by: Stephen Boyd Acked-by: Vlastimil Babka Reviewed-by: Muchun Song Cc: Christoph Lameter Cc: David Rientjes Cc: Joe Perches Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 13857910055b..1de695200efe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1429,6 +1429,8 @@ static int __init setup_slub_debug(char *str) out: if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); + else + static_branch_disable(&slub_debug_enabled); if ((static_branch_unlikely(&init_on_alloc) || static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) -- cgit From 1a88ef87f861e10611e9162c4c701704bfdeed85 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 28 Jun 2021 19:34:46 -0700 Subject: slub: actually use 'message' in restore_bytes() The message argument isn't used here. Let's pass the string to the printk message so that the developer can figure out what's happening, instead of guessing that a redzone is being restored, etc. Link: https://lkml.kernel.org/r/20210601182202.3011020-3-swboyd@chromium.org Signed-off-by: Stephen Boyd Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Muchun Song Cc: Christoph Lameter Cc: Joe Perches Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 1de695200efe..89a4045b917d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -806,7 +806,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { - slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x\n", message, from, to - 1, data); memset(from, data, to - from); } -- cgit From 582d1212edc73e6459d5219a24f312799877b61e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Jun 2021 19:34:49 -0700 Subject: slub: indicate slab_fix() uses printf formats Ideally, slab_fix() would be marked with __printf and the format here would not use \n as that's emitted by the slab_fix(). Make these changes. Link: https://lkml.kernel.org/r/20210601182202.3011020-4-swboyd@chromium.org Signed-off-by: Joe Perches Acked-by: Vlastimil Babka Signed-off-by: Stephen Boyd Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Muchun Song Cc: Pekka Enberg Cc: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 89a4045b917d..f8e4d37c4641 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -692,6 +692,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) va_end(args); } +__printf(2, 3) static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
{ struct va_format vaf; @@ -806,7 +807,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { - slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x\n", message, from, to - 1, data); + slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); memset(from, data, to - from); } @@ -1059,13 +1060,13 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) slab_err(s, page, "Wrong number of objects. Found %d but should be %d", page->objects, max_objects); page->objects = max_objects; - slab_fix(s, "Number of objects adjusted."); + slab_fix(s, "Number of objects adjusted"); } if (page->inuse != page->objects - nr) { slab_err(s, page, "Wrong object count. Counter is %d but counted were %d", page->inuse, page->objects - nr); page->inuse = page->objects - nr; - slab_fix(s, "Object count adjusted."); + slab_fix(s, "Object count adjusted"); } return search == NULL; } -- cgit From 792702911f581f7793962fbeb99d5c3a1b28f4c3 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 28 Jun 2021 19:34:52 -0700 Subject: slub: force on no_hash_pointers when slub_debug is enabled Obscuring the pointers that slub shows when debugging makes for some confusing slub debug messages: Padding overwritten. 0x0000000079f0674a-0x000000000d4dce17 Those addresses are hashed for kernel security reasons. If we're trying to be secure with slub_debug on the commandline we have some big problems given that we dump whole chunks of kernel memory to the kernel logs. Let's force on the no_hash_pointers commandline flag when slub_debug is on the commandline. This makes slub debug messages more meaningful and if by chance a kernel address is in some slub debug object dump we will have a better chance of figuring out what went wrong. Note that we don't use %px in the slub code because we want to reduce the number of places that %px is used in the kernel. This also nicely prints a big fat warning at kernel boot if slub_debug is on the commandline so that we know that this kernel shouldn't be used on production systems. 
[akpm@linux-foundation.org: fix build with CONFIG_SLUB_DEBUG=n] Link: https://lkml.kernel.org/r/20210601182202.3011020-5-swboyd@chromium.org Signed-off-by: Stephen Boyd Acked-by: Vlastimil Babka Acked-by: Petr Mladek Cc: Joe Perches Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ lib/vsprintf.c | 2 +- mm/slub.c | 20 +++++++++++++++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 15d8bad3d2f2..bf950621febf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -357,6 +357,8 @@ int sscanf(const char *, const char *, ...); extern __scanf(2, 0) int vsscanf(const char *, const char *, va_list); +extern int no_hash_pointers_enable(char *str); + extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index f0c35d9b65bf..cc281f5895f9 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2186,7 +2186,7 @@ char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); -static int __init no_hash_pointers_enable(char *str) +int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) return 0; diff --git a/mm/slub.c b/mm/slub.c index f8e4d37c4641..4b2ba9c099c9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -118,12 +118,26 @@ */ #ifdef CONFIG_SLUB_DEBUG + #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); #else DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif -#endif + +static inline bool __slub_debug_enabled(void) +{ + return static_branch_unlikely(&slub_debug_enabled); +} + +#else /* CONFIG_SLUB_DEBUG */ + +static inline bool __slub_debug_enabled(void) +{ + return false; +} + +#endif /* CONFIG_SLUB_DEBUG */ static inline bool kmem_cache_debug(struct kmem_cache *s) { @@ -4487,6 +4501,10 @@ void __init kmem_cache_init(void) if (debug_guardpage_minorder()) slub_max_order = 0; + /* Print slub debugging pointers without hashing */ + if (__slub_debug_enabled()) + no_hash_pointers_enable(NULL); + kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; -- cgit From 64dd68497be76ab4e237cca06f5324e220d0f050 Mon Sep 17 00:00:00 2001 From: Faiyaz Mohammed Date: Mon, 28 Jun 2021 19:34:55 -0700 Subject: mm: slub: move sysfs slab alloc/free interfaces to debugfs alloc_calls and free_calls implementation in sysfs have two issues, one is PAGE_SIZE limitation of sysfs and other is it does not adhere to "one value per file" rule. To overcome this issues, move the alloc_calls and free_calls implementation to debugfs. Debugfs cache will be created if SLAB_STORE_USER flag is set. Rename the alloc_calls/free_calls to alloc_traces/free_traces, to be inline with what it does. 
[faiyazm@codeaurora.org: fix the leak of alloc/free traces debugfs interface] Link: https://lkml.kernel.org/r/1624248060-30286-1-git-send-email-faiyazm@codeaurora.org Link: https://lkml.kernel.org/r/1623438200-19361-1-git-send-email-faiyazm@codeaurora.org Signed-off-by: Faiyaz Mohammed Reviewed-by: Vlastimil Babka Reviewed-by: Greg Kroah-Hartman Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 6 ++ mm/slab_common.c | 2 + mm/slub.c | 274 ++++++++++++++++++++++++++++++++++++------------------- 3 files changed, 189 insertions(+), 93 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 9b690fa44cae..7f9b4bd9fc65 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -631,6 +631,12 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c) return false; } +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +void debugfs_slab_release(struct kmem_cache *); +#else +static inline void debugfs_slab_release(struct kmem_cache *s) { } +#endif + #ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { diff --git a/mm/slab_common.c b/mm/slab_common.c index b97b6fa8a7c6..6c0db9f9bd8a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -448,6 +448,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); list_for_each_entry_safe(s, s2, &to_destroy, list) { + debugfs_slab_release(s); kfence_shutdown_cache(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_release(s); @@ -475,6 +476,7 @@ static int shutdown_cache(struct kmem_cache *s) schedule_work(&slab_caches_to_rcu_destroy_work); } else { kfence_shutdown_cache(s); + debugfs_slab_release(s); #ifdef SLAB_SUPPORTS_SYSFS sysfs_slab_unlink(s); sysfs_slab_release(s); diff --git a/mm/slub.c b/mm/slub.c index 4b2ba9c099c9..70bb844a44d2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -38,6 +38,7 @@ #include #include +#include #include #include "internal.h" @@ -238,6 +239,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } #endif +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +static void debugfs_slab_add(struct kmem_cache *); +#else +static inline void debugfs_slab_add(struct kmem_cache *s) { } +#endif + static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS @@ -4593,6 +4600,9 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) if (err) __kmem_cache_release(s); + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + return err; } @@ -4739,6 +4749,7 @@ long validate_slab_cache(struct kmem_cache *s) } EXPORT_SYMBOL(validate_slab_cache); +#ifdef CONFIG_DEBUG_FS /* * Generate lists of code addresses where slabcache objects are allocated * and freed. 
@@ -4762,6 +4773,8 @@ struct loc_track { struct location *loc; }; +static struct dentry *slab_debugfs_root; + static void free_loc_track(struct loc_track *t) { if (t->max) @@ -4878,82 +4891,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, add_location(t, s, get_track(s, p, alloc)); put_map(map); } - -static int list_locations(struct kmem_cache *s, char *buf, - enum track_item alloc) -{ - int len = 0; - unsigned long i; - struct loc_track t = { 0, 0, NULL }; - int node; - struct kmem_cache_node *n; - - if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_KERNEL)) { - return sysfs_emit(buf, "Out of memory\n"); - } - /* Push back cpu slabs */ - flush_all(s); - - for_each_kmem_cache_node(s, node, n) { - unsigned long flags; - struct page *page; - - if (!atomic_long_read(&n->nr_slabs)) - continue; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, slab_list) - process_slab(&t, s, page, alloc); - list_for_each_entry(page, &n->full, slab_list) - process_slab(&t, s, page, alloc); - spin_unlock_irqrestore(&n->list_lock, flags); - } - - for (i = 0; i < t.count; i++) { - struct location *l = &t.loc[i]; - - len += sysfs_emit_at(buf, len, "%7ld ", l->count); - - if (l->addr) - len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr); - else - len += sysfs_emit_at(buf, len, ""); - - if (l->sum_time != l->min_time) - len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld", - l->min_time, - (long)div_u64(l->sum_time, - l->count), - l->max_time); - else - len += sysfs_emit_at(buf, len, " age=%ld", l->min_time); - - if (l->min_pid != l->max_pid) - len += sysfs_emit_at(buf, len, " pid=%ld-%ld", - l->min_pid, l->max_pid); - else - len += sysfs_emit_at(buf, len, " pid=%ld", - l->min_pid); - - if (num_online_cpus() > 1 && - !cpumask_empty(to_cpumask(l->cpus))) - len += sysfs_emit_at(buf, len, " cpus=%*pbl", - cpumask_pr_args(to_cpumask(l->cpus))); - - if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) - len += sysfs_emit_at(buf, len, " nodes=%*pbl", - nodemask_pr_args(&l->nodes)); - - len += sysfs_emit_at(buf, len, "\n"); - } - - free_loc_track(&t); - if (!t.count) - len += sysfs_emit_at(buf, len, "No data\n"); - - return len; -} +#endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ #ifdef CONFIG_SYSFS @@ -5343,21 +5281,6 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); #endif /* CONFIG_SLUB_DEBUG */ #ifdef CONFIG_FAILSLAB @@ -5521,8 +5444,6 @@ static struct attribute *slab_attrs[] = { &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &alloc_calls_attr.attr, - &free_calls_attr.attr, #endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, @@ -5810,6 +5731,173 @@ static int __init slab_sysfs_init(void) __initcall(slab_sysfs_init); #endif /* CONFIG_SYSFS */ +#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) +static int slab_debugfs_show(struct seq_file *seq, void *v) +{ + + struct location *l; + unsigned int idx = *(unsigned int *)v; + struct loc_track *t = seq->private; + + if (idx < t->count) { + l = &t->loc[idx]; + + seq_printf(seq, "%7ld ", l->count); + + if (l->addr) + seq_printf(seq, "%pS", 
(void *)l->addr); + else + seq_puts(seq, ""); + + if (l->sum_time != l->min_time) { + seq_printf(seq, " age=%ld/%llu/%ld", + l->min_time, div_u64(l->sum_time, l->count), + l->max_time); + } else + seq_printf(seq, " age=%ld", l->min_time); + + if (l->min_pid != l->max_pid) + seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid); + else + seq_printf(seq, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus))) + seq_printf(seq, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + seq_printf(seq, " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + + seq_puts(seq, "\n"); + } + + if (!idx && !t->count) + seq_puts(seq, "No data\n"); + + return 0; +} + +static void slab_debugfs_stop(struct seq_file *seq, void *v) +{ +} + +static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + v = ppos; + ++*ppos; + if (*ppos <= t->count) + return v; + + return NULL; +} + +static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) +{ + return ppos; +} + +static const struct seq_operations slab_debugfs_sops = { + .start = slab_debugfs_start, + .next = slab_debugfs_next, + .stop = slab_debugfs_stop, + .show = slab_debugfs_show, +}; + +static int slab_debug_trace_open(struct inode *inode, struct file *filep) +{ + + struct kmem_cache_node *n; + enum track_item alloc; + int node; + struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, + sizeof(struct loc_track)); + struct kmem_cache *s = file_inode(filep)->i_private; + + if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) + alloc = TRACK_ALLOC; + else + alloc = TRACK_FREE; + + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) + return -ENOMEM; + + /* Push back cpu slabs */ + flush_all(s); + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct page *page; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + process_slab(t, s, page, alloc); + list_for_each_entry(page, &n->full, slab_list) + process_slab(t, s, page, alloc); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + return 0; +} + +static int slab_debug_trace_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct loc_track *t = seq->private; + + free_loc_track(t); + return seq_release_private(inode, file); +} + +static const struct file_operations slab_debugfs_fops = { + .open = slab_debug_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = slab_debug_trace_release, +}; + +static void debugfs_slab_add(struct kmem_cache *s) +{ + struct dentry *slab_cache_dir; + + if (unlikely(!slab_debugfs_root)) + return; + + slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); + + debugfs_create_file("alloc_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); + + debugfs_create_file("free_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); +} + +void debugfs_slab_release(struct kmem_cache *s) +{ + debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root)); +} + +static int __init slab_debugfs_init(void) +{ + struct kmem_cache *s; + + slab_debugfs_root = debugfs_create_dir("slab", NULL); + + list_for_each_entry(s, &slab_caches, list) + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; + +} +__initcall(slab_debugfs_init); +#endif /* * The /proc/slabinfo ABI */ -- cgit 
From 65ebdeef103fd70988fdd0ffef1d4fecb0cb97ed Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Mon, 28 Jun 2021 19:34:58 -0700 Subject: mm/slub: add taint after the errors are printed When running the kernel with panic_on_taint, the usual slub debug error messages are not being printed when object corruption happens. That's because we panic in add_taint(), which is called before printing the additional information. This is a bit unfortunate as the error messages are actually very useful, especially before a panic. Let's fix this by moving add_taint() after the errors are printed on the console. Link: https://lkml.kernel.org/r/1623860738-146761-1-git-send-email-quic_c_gdjako@quicinc.com Signed-off-by: Georgi Djakov Acked-by: Rafael Aquini Acked-by: David Rientjes Acked-by: Vlastimil Babka Reviewed-by: Aaron Tomlin Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 70bb844a44d2..3bc8b940c933 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -708,8 +708,6 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) pr_err("=============================================================================\n"); pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); - - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); va_end(args); } @@ -790,6 +788,7 @@ void object_err(struct kmem_cache *s, struct page *page, slab_bug(s, "%s", reason); print_trailer(s, page, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, @@ -807,6 +806,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -858,6 +858,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, fault, end - 1, fault - addr, fault[0], value); print_trailer(s, page, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); skip_bug_print: restore_bytes(s, what, value, fault, end); -- cgit From 54dd200c5a251b5db9f6f0f72a251c28e0d7da43 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Mon, 28 Jun 2021 19:35:01 -0700 Subject: mm/kmemleak: fix possible wrong memory scanning period This commit contains 3 modifications: 1. Convert the type of jiffies_scan_wait to "unsigned long". 2. Use READ/WRITE_ONCE() for accessing "jiffies_scan_wait". 3. Fix the possible wrong memory scanning period. If you set a large memory scanning period like the one below, then the "secs" variable will be non-zero, yet the value of "jiffies_scan_wait" can end up being zero. echo "scan=0x10000000" > /sys/kernel/debug/kmemleak This is because the type of msecs_to_jiffies()'s parameter is "unsigned int", and "secs * 1000" is larger than its maximum value. This in turn leads to an unexpected jiffies_scan_wait, possibly zero. We correct it by replacing kstrtoul() with kstrtouint(), and check the msecs value to prevent it from exceeding UINT_MAX.
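For illustration, a hedged sketch of the truncation being fixed (the numbers are worked out for this example, not quoted from the patch; a 64-bit unsigned long is assumed):

	/*
	 * secs was parsed into an unsigned long, but msecs_to_jiffies() takes
	 * an unsigned int, so the product was silently reduced modulo 2^32.
	 */
	static unsigned int example_truncation(void)
	{
		unsigned long secs = 0x10000000;	/* from "scan=0x10000000" */
		unsigned long msecs = secs * 1000;	/* 0x3E80000000 */

		/*
		 * 0x80000000: only the low 32 bits reach msecs_to_jiffies(),
		 * which is not what was asked for; other inputs truncate to 0
		 * outright, disabling the wait entirely.
		 */
		return (unsigned int)msecs;
	}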
Link: https://lkml.kernel.org/r/20210613174022.23044-1-yanfei.xu@windriver.com Signed-off-by: Yanfei Xu Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 92a2d4885808..228a2fbe0657 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -219,7 +219,7 @@ static struct task_struct *scan_thread; static unsigned long jiffies_min_age; static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ -static signed long jiffies_scan_wait; +static unsigned long jiffies_scan_wait; /* enables or disables the task stacks scanning */ static int kmemleak_stack_scan = 1; /* protects the memory scanning, parameters and debug/kmemleak file access */ @@ -1567,7 +1567,7 @@ static int kmemleak_scan_thread(void *arg) } while (!kthread_should_stop()) { - signed long timeout = jiffies_scan_wait; + signed long timeout = READ_ONCE(jiffies_scan_wait); mutex_lock(&scan_mutex); kmemleak_scan(); @@ -1807,14 +1807,20 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, else if (strncmp(buf, "scan=off", 8) == 0) stop_scan_thread(); else if (strncmp(buf, "scan=", 5) == 0) { - unsigned long secs; + unsigned secs; + unsigned long msecs; - ret = kstrtoul(buf + 5, 0, &secs); + ret = kstrtouint(buf + 5, 0, &secs); if (ret < 0) goto out; + + msecs = secs * MSEC_PER_SEC; + if (msecs > UINT_MAX) + msecs = UINT_MAX; + stop_scan_thread(); - if (secs) { - jiffies_scan_wait = msecs_to_jiffies(secs * 1000); + if (msecs) { + WRITE_ONCE(jiffies_scan_wait, msecs_to_jiffies(msecs)); start_scan_thread(); } } else if (strncmp(buf, "scan", 4) == 0) -- cgit From 1a14e3779dd58c16b30e56558146e5cc850ba8b0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 28 Jun 2021 19:35:04 -0700 Subject: dax: fix ENOMEM handling in grab_mapping_entry() grab_mapping_entry() has a bug in its handling of the ENOMEM condition. Suppose we have a PMD entry at index i which we are downgrading to a PTE entry. grab_mapping_entry() will set pmd_downgrade to true, lock the entry, clear the entry in the xarray, and decrement mapping->nrpages. Then it will call: entry = dax_make_entry(pfn_to_pfn_t(0), flags); dax_lock_entry(xas, entry); which inserts a new PTE entry into the xarray. However, this may fail to allocate the new node. We handle this by: if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) goto retry; however pmd_downgrade stays set to true even though the 'entry' returned from get_unlocked_entry() will be NULL now. And we will go through the downgrade branch again. This is mostly harmless except that mapping->nrpages is decremented again and we temporarily have an invalid entry stored in the xarray. Fix the problem by setting pmd_downgrade to false each time we look up the entry we work with, so that it matches the entry we found.
Link: https://lkml.kernel.org/r/20210622160015.18004-1-jack@suse.cz Fixes: b15cd800682f ("dax: Convert page fault handlers to XArray") Signed-off-by: Jan Kara Reviewed-by: Dan Williams Cc: Matthew Wilcox Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 62352cbcf0f4..da41f9363568 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -488,10 +488,11 @@ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; - bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry; retry: + pmd_downgrade = false; xas_lock_irq(xas); entry = get_unlocked_entry(xas, order); -- cgit From 85f29cd6a12d430706c39247e7d0207590f581df Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Mon, 28 Jun 2021 19:35:07 -0700 Subject: tools/vm/page_owner_sort.c: check malloc() return Link: https://lkml.kernel.org/r/20210506131402.10416-1-tangbin@cmss.chinamobile.com Signed-off-by: Zhang Shengju Signed-off-by: Tang Bin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page_owner_sort.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 85eb65ea16d3..0e75f22c9475 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -132,6 +132,10 @@ int main(int argc, char **argv) qsort(list, list_size, sizeof(list[0]), compare_txt); list2 = malloc(sizeof(*list) * list_size); + if (!list2) { + printf("Out of memory\n"); + exit(1); + } printf("culling\n"); -- cgit From 65ac1a60a57e2c55f2ac37f27095f6b012295e81 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 28 Jun 2021 19:35:10 -0700 Subject: mm/debug_vm_pgtable: ensure THP availability via has_transparent_hugepage() On certain platforms, THP support could not just be validated via the build option CONFIG_TRANSPARENT_HUGEPAGE. Instead has_transparent_hugepage() also needs to be called upon to verify THP runtime support. Otherwise the debug test will just run into unusable THP helpers like in the case of a 4K hash config on powerpc platform [1]. This just moves all pfn_pmd() and pfn_pud() after THP runtime validation with has_transparent_hugepage() which prevents the mentioned problem. 
[1] https://bugzilla.kernel.org/show_bug.cgi?id=213069 Link: https://lkml.kernel.org/r/1621397588-19211-1-git-send-email-anshuman.khandual@arm.com Fixes: 787d563b8642 ("mm/debug_vm_pgtable: fix kernel crash by checking for THP support") Signed-off-by: Anshuman Khandual Cc: Aneesh Kumar K.V Cc: Christophe Leroy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 63 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 297d1b349c19..92bfc37300df 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -146,13 +146,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) static void __init pmd_basic_tests(unsigned long pfn, int idx) { pgprot_t prot = protection_map[idx]; - pmd_t pmd = pfn_pmd(pfn, prot); unsigned long val = idx, *ptr = &val; + pmd_t pmd; if (!has_transparent_hugepage()) return; pr_debug("Validating PMD basic (%pGv)\n", ptr); + pmd = pfn_pmd(pfn, prot); /* * This test needs to be executed after the given page table entry @@ -185,7 +186,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, unsigned long pfn, unsigned long vaddr, pgprot_t prot, pgtable_t pgtable) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; if (!has_transparent_hugepage()) return; @@ -232,9 +233,14 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; pr_debug("Validating PMD leaf\n"); + pmd = pfn_pmd(pfn, prot); + /* * PMD based THP is a leaf entry. */ @@ -267,12 +273,16 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; + if (!has_transparent_hugepage()) + return; + pr_debug("Validating PMD saved write\n"); + pmd = pfn_pmd(pfn, prot); WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); } @@ -281,13 +291,14 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { pgprot_t prot = protection_map[idx]; - pud_t pud = pfn_pud(pfn, prot); unsigned long val = idx, *ptr = &val; + pud_t pud; if (!has_transparent_hugepage()) return; pr_debug("Validating PUD basic (%pGv)\n", ptr); + pud = pfn_pud(pfn, prot); /* * This test needs to be executed after the given page table entry @@ -323,7 +334,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, unsigned long pfn, unsigned long vaddr, pgprot_t prot) { - pud_t pud = pfn_pud(pfn, prot); + pud_t pud; if (!has_transparent_hugepage()) return; @@ -332,6 +343,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, /* Align the address wrt HPAGE_PUD_SIZE */ vaddr &= HPAGE_PUD_MASK; + pud = pfn_pud(pfn, prot); set_pud_at(mm, vaddr, pudp, pud); pudp_set_wrprotect(mm, vaddr, pudp); pud = READ_ONCE(*pudp); @@ -370,9 +382,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm, static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { - pud_t pud = pfn_pud(pfn, prot); + pud_t pud; + + if (!has_transparent_hugepage()) + return; pr_debug("Validating PUD leaf\n"); + pud = pfn_pud(pfn, prot); /* 
* PUD based THP is a leaf entry. */ @@ -654,12 +670,16 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); + pmd_t pmd; if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) return; + if (!has_transparent_hugepage()) + return; + pr_debug("Validating PMD protnone\n"); + pmd = pmd_mkhuge(pfn_pmd(pfn, prot)); WARN_ON(!pmd_protnone(pmd)); WARN_ON(!pmd_present(pmd)); } @@ -679,18 +699,26 @@ static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; pr_debug("Validating PMD devmap\n"); + pmd = pfn_pmd(pfn, prot); WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { - pud_t pud = pfn_pud(pfn, prot); + pud_t pud; + + if (!has_transparent_hugepage()) + return; pr_debug("Validating PUD devmap\n"); + pud = pfn_pud(pfn, prot); WARN_ON(!pud_devmap(pud_mkdevmap(pud))); } #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -733,25 +761,33 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return; + if (!has_transparent_hugepage()) + return; + pr_debug("Validating PMD soft dirty\n"); + pmd = pfn_pmd(pfn, prot); WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd))); WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); } static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { - pmd_t pmd = pfn_pmd(pfn, prot); + pmd_t pmd; if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) return; + if (!has_transparent_hugepage()) + return; + pr_debug("Validating PMD swap soft dirty\n"); + pmd = pfn_pmd(pfn, prot); WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); } @@ -780,6 +816,9 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) swp_entry_t swp; pmd_t pmd; + if (!has_transparent_hugepage()) + return; + pr_debug("Validating PMD swap\n"); pmd = pfn_pmd(pfn, prot); swp = __pmd_to_swp_entry(pmd); -- cgit From 832b50725373e8c46781b7d4db104ec9cf564a6b Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 28 Jun 2021 19:35:13 -0700 Subject: mm: mmap_lock: use local locks instead of disabling preemption mmap_lock will explicitly disable/enable preemption upon manipulating its local CPU variables. This is to be expected, but in this case, it doesn't play well with PREEMPT_RT. The preemption disabled code section also takes a spin-lock. Spin-locks in RT systems will try to schedule, which is exactly what we're trying to avoid. To mitigate this, convert the explicit preemption handling to local_locks. Which are RT aware, and will disable migration instead of preemption when PREEMPT_RT=y. The faulty call trace looks like the following: __mmap_lock_do_trace_*() preempt_disable() get_mm_memcg_path() cgroup_path() kernfs_path_from_node() spin_lock_irqsave() /* Scheduling while atomic! 
*/ Link: https://lkml.kernel.org/r/20210604163506.2103900-1-nsaenzju@redhat.com Fixes: 2b5067a8143e3 ("mm: mmap_lock: add tracepoints around lock acquisition ") Signed-off-by: Nicolas Saenz Julienne Tested-by: Axel Rasmussen Reviewed-by: Axel Rasmussen Cc: Vlastimil Babka Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap_lock.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index dcdde4f722a4..2ae3f33b85b1 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -11,6 +11,7 @@ #include #include #include +#include EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); @@ -39,21 +40,30 @@ static int reg_refcount; /* Protected by reg_lock. */ */ #define CONTEXT_COUNT 4 -static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +struct memcg_path { + local_lock_t lock; + char __rcu *buf; + local_t buf_idx; +}; +static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { + .lock = INIT_LOCAL_LOCK(lock), + .buf_idx = LOCAL_INIT(0), +}; + static char **tmp_bufs; -static DEFINE_PER_CPU(int, memcg_path_buf_idx); /* Called with reg_lock held. */ static void free_memcg_path_bufs(void) { + struct memcg_path *memcg_path; int cpu; char **old = tmp_bufs; for_each_possible_cpu(cpu) { - *(old++) = rcu_dereference_protected( - per_cpu(memcg_path_buf, cpu), + memcg_path = per_cpu_ptr(&memcg_paths, cpu); + *(old++) = rcu_dereference_protected(memcg_path->buf, lockdep_is_held(®_lock)); - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + rcu_assign_pointer(memcg_path->buf, NULL); } /* Wait for inflight memcg_path_buf users to finish. */ @@ -88,7 +98,7 @@ int trace_mmap_lock_reg(void) new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); if (new == NULL) goto out_fail_free; - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); /* Don't need to wait for inflights, they'd have gotten NULL. */ } @@ -122,23 +132,24 @@ out: static inline char *get_memcg_path_buf(void) { + struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); char *buf; int idx; rcu_read_lock(); - buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + buf = rcu_dereference(memcg_path->buf); if (buf == NULL) { rcu_read_unlock(); return NULL; } - idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - MEMCG_PATH_BUF_SIZE; return &buf[idx]; } static inline void put_memcg_path_buf(void) { - this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); rcu_read_unlock(); } @@ -179,14 +190,14 @@ out: #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0) #else /* !CONFIG_MEMCG */ -- cgit From 5631de543acb5c7a740534e727f7432e45a9e6dd Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 28 Jun 2021 19:35:16 -0700 Subject: mm/page_reporting: fix code style in __page_reporting_request() Patch series "mm/page_reporting: Make page reporting work on arm64 with 64KB page size", v4. 
The page reporting threshold is currently equal to @pageblock_order, which is 13 and 512MB on arm64 with 64KB base page size selected. The page reporting won't be triggered if the freeing page can't come up with a free area like that huge. The condition is hard to be met, especially when the system memory becomes fragmented. This series intends to solve the issue by having page reporting threshold as 5 (2MB) on arm64 with 64KB base page size. The patches are organized as: PATCH[1/4] Fix some coding style in __page_reporting_request(). PATCH[2/4] Represents page reporting order with variable so that it can be exported as module parameter. PATCH[3/4] Allows the device driver (e.g. virtio_balloon) to specify the page reporting order when the device info is registered. PATCH[4/4] Specifies the page reporting order to 5, corresponding to 2MB in size on ARM64 when 64KB base page size is used. This patch (of 4): The lines of comments would be starting with one, instead two space. This corrects the style. Link: https://lkml.kernel.org/r/20210625014710.42954-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20210625014710.42954-2-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Alexander Duyck Cc: David Hildenbrand Cc: "Michael S. Tsirkin" Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_reporting.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index c50d93ffa252..df9c5054e1b4 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -31,8 +31,8 @@ __page_reporting_request(struct page_reporting_dev_info *prdev) return; /* - * If reporting is already active there is nothing we need to do. - * Test against 0 as that represents PAGE_REPORTING_IDLE. + * If reporting is already active there is nothing we need to do. + * Test against 0 as that represents PAGE_REPORTING_IDLE. */ state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); if (state != PAGE_REPORTING_IDLE) -- cgit From f58780a8e3851edae5bafb7d3af19425308a37f5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 28 Jun 2021 19:35:19 -0700 Subject: mm/page_reporting: export reporting order as module parameter The macro PAGE_REPORTING_MIN_ORDER is defined as the page reporting threshold. It can't be adjusted at runtime. This introduces a variable (@page_reporting_order) to replace the marcro (PAGE_REPORTING_MIN_ORDER). MAX_ORDER is assigned to it initially, meaning the page reporting is disabled. It will be specified by driver if valid one is provided. Otherwise, it will fall back to @pageblock_order. It's also exported so that the page reporting order can be adjusted at runtime. Link: https://lkml.kernel.org/r/20210625014710.42954-3-gshan@redhat.com Signed-off-by: Gavin Shan Suggested-by: David Hildenbrand Reviewed-by: Alexander Duyck Cc: Anshuman Khandual Cc: Catalin Marinas Cc: "Michael S. 
Tsirkin" Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ mm/page_reporting.c | 9 +++++++-- mm/page_reporting.h | 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..566c4b9af3cd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3566,6 +3566,12 @@ off: turn off poisoning (default) on: turn on poisoning + page_reporting.page_reporting_order= + [KNL] Minimal page reporting order + Format: + Adjust the minimal page reporting order. The page + reporting is disabled when it exceeds (MAX_ORDER-1). + panic= [KNL] Kernel behaviour on panic: delay timeout > 0: seconds before rebooting timeout = 0: wait forever diff --git a/mm/page_reporting.c b/mm/page_reporting.c index df9c5054e1b4..34bf4d26c2c4 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -4,12 +4,17 @@ #include #include #include +#include #include #include #include "page_reporting.h" #include "internal.h" +unsigned int page_reporting_order = MAX_ORDER; +module_param(page_reporting_order, uint, 0644); +MODULE_PARM_DESC(page_reporting_order, "Set page reporting order"); + #define PAGE_REPORTING_DELAY (2 * HZ) static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; @@ -229,7 +234,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, /* Generate minimum watermark to be able to guarantee progress */ watermark = low_wmark_pages(zone) + - (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER); + (PAGE_REPORTING_CAPACITY << page_reporting_order); /* * Cancel request if insufficient free memory or if we failed @@ -239,7 +244,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err; /* Process each free list starting from lowest order/mt */ - for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) { + for (order = page_reporting_order; order < MAX_ORDER; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) diff --git a/mm/page_reporting.h b/mm/page_reporting.h index 2c385dd4ddbd..c51dbc228b94 100644 --- a/mm/page_reporting.h +++ b/mm/page_reporting.h @@ -10,10 +10,9 @@ #include #include -#define PAGE_REPORTING_MIN_ORDER pageblock_order - #ifdef CONFIG_PAGE_REPORTING DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); +extern unsigned int page_reporting_order; void __page_reporting_notify(void); static inline bool page_reported(struct page *page) @@ -38,7 +37,7 @@ static inline void page_reporting_notify_free(unsigned int order) return; /* Determine if we have crossed reporting threshold */ - if (order < PAGE_REPORTING_MIN_ORDER) + if (order < page_reporting_order) return; /* This will add a few cycles, but should be called infrequently */ -- cgit From 9f849c6f9572d8cef407f55928d3dc68fc42ad3e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 28 Jun 2021 19:35:22 -0700 Subject: mm/page_reporting: allow driver to specify reporting order The page reporting order (threshold) is sticky to @pageblock_order by default. The page reporting can never be triggered because the freeing page can't come up with a free area like that huge. The situation becomes worse when the system memory becomes heavily fragmented. For example, the following configurations are used on ARM64 when 64KB base page size is enabled. 
In this specific case, the page reporting won't be triggered until the freeing page comes up with a 512MB free area. That's hard to be met, especially when the system memory becomes heavily fragmented. PAGE_SIZE: 64KB HPAGE_SIZE: 512MB pageblock_order: 13 (512MB) MAX_ORDER: 14 This allows the drivers to specify the page reporting order when the page reporting device is registered. It falls back to @pageblock_order if it's not specified by the driver. The existing users (hv_balloon and virtio_balloon) don't specify it and @pageblock_order is still taken as their page reporting order. So this shouldn't introduce any functional changes. Link: https://lkml.kernel.org/r/20210625014710.42954-4-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Alexander Duyck Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: "Michael S. Tsirkin" Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_reporting.h | 3 +++ mm/page_reporting.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h index 3b99e0ec24f2..fe648dfa3a7c 100644 --- a/include/linux/page_reporting.h +++ b/include/linux/page_reporting.h @@ -18,6 +18,9 @@ struct page_reporting_dev_info { /* Current state of page reporting */ atomic_t state; + + /* Minimal order of page reporting */ + unsigned int order; }; /* Tear-down and bring-up for page reporting devices */ diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 34bf4d26c2c4..382958eef8a9 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -329,6 +329,12 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) goto err_out; } + /* + * Update the page reporting order if it's specified by driver. + * Otherwise, it falls back to @pageblock_order. + */ + page_reporting_order = prdev->order ? : pageblock_order; + /* initialize state and work structures */ atomic_set(&prdev->state, PAGE_REPORTING_IDLE); INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); -- cgit From f8af4d0892cbb84fc3913de75ba5da374147a691 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 28 Jun 2021 19:35:25 -0700 Subject: virtio_balloon: specify page reporting order if needed The page reporting won't be triggered if the freeing page can't come up with a free area, whose size is equal or bigger than the threshold (page reporting order). The default page reporting order, equal to @pageblock_order, is too huge on some architectures to trigger page reporting. One example is ARM64 when 64KB base page size is used. PAGE_SIZE: 64KB pageblock_order: 13 (512MB) MAX_ORDER: 14 This specifies the page reporting order to 5 (2MB) for this specific case so that page reporting can be triggered. Link: https://lkml.kernel.org/r/20210625014710.42954-5-gshan@redhat.com Signed-off-by: Gavin Shan Reviewed-by: Alexander Duyck Cc: Michael S. 
Tsirkin Cc: David Hildenbrand Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/virtio_balloon.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 510e9318854d..47dce91f788c 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -993,6 +993,23 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_unregister_oom; } + /* + * The default page reporting order is @pageblock_order, which + * corresponds to 512MB in size on ARM64 when 64KB base page + * size is used. The page reporting won't be triggered if the + * freeing page can't come up with a free area like that huge. + * So we specify the page reporting order to 5, corresponding + * to 2MB. It helps to avoid THP splitting if 4KB base page + * size is used by host. + * + * Ideally, the page reporting order is selected based on the + * host's base page size. However, it needs more work to report + * that value. The hard-coded order would be fine currently. + */ +#if defined(CONFIG_ARM64) && defined(CONFIG_ARM64_64K_PAGES) + vb->pr_dev_info.order = 5; +#endif + err = page_reporting_register(&vb->pr_dev_info); if (err) goto out_unregister_oom; -- cgit From 5defd497ed78fdc2bad115b0b4316c0c0de8b485 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 28 Jun 2021 19:35:28 -0700 Subject: mm: page-writeback: kill get_writeback_state() comments The get_writeback_state() has gone since 2006, kill related comments. Link: https://lkml.kernel.org/r/20210508125026.56600-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0062d5c57d41..1bbe185a6524 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1869,10 +1869,9 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * On really big machines, get_writeback_state is expensive, so try to avoid - * calling it too often (ratelimiting). But once we're over the dirty memory - * limit we decrease the ratelimiting by a lot, to prevent individual processes - * from overshooting the limit by (ratelimit_pages) each. + * Once we're over the dirty memory limit we decrease the ratelimiting + * by a lot, to prevent individual processes from overshooting the limit + * by (ratelimit_pages) each. */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { @@ -2045,8 +2044,6 @@ void laptop_sync_completion(void) /* * If ratelimit_pages is too high then we can get into dirty-data overload * if a large number of processes all perform writes at the same time. - * If it is too low then SMP machines will call the (expensive) - * get_writeback_state too often. * * Here we set ratelimit_pages to a level which ensures that when all CPUs are * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory -- cgit From ab19939a6a5010cba4e9cb04dd8bee03c72edcbd Mon Sep 17 00:00:00 2001 From: Chi Wu Date: Mon, 28 Jun 2021 19:35:31 -0700 Subject: mm/page-writeback: Fix performance when BDI's share of ratio is 0. Fix performance when BDI's share of ratio is 0. 
The issue is similar to commit 74d369443325 ("writeback: Fix performance regression in wb_over_bg_thresh()"). Balance_dirty_pages and the writeback worker will also disagree on whether writeback when a BDI uses BDI_CAP_STRICTLIMIT and BDI's share of the thresh ratio is zero. For example, A thread on cpu0 writes 32 pages and then balance_dirty_pages, it will wake up background writeback and pauses because wb_dirty > wb->wb_thresh = 0 (share of thresh ratio is zero). A thread may runs on cpu0 again because scheduler prefers pre_cpu. Then writeback worker may runs on other cpus(1,2..) which causes the value of wb_stat(wb, WB_RECLAIMABLE) in wb_over_bg_thresh is 0 and does not writeback and returns. Thus, balance_dirty_pages keeps looping, sleeping and then waking up the worker who will do nothing. It remains stuck in this state until the writeback worker hit the right dirty cpu or the dirty pages expire. The fix that we should get the wb_stat_sum radically when thresh is low. Link: https://lkml.kernel.org/r/20210428225046.16301-1-wuchi.zero@gmail.com Signed-off-by: Chi Wu Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Miklos Szeredi Cc: Sedat Dilek Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1bbe185a6524..aff3205dfde5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1944,6 +1944,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; + unsigned long reclaimable; + unsigned long thresh; /* * Similar to balance_dirty_pages() but ignores pages being written @@ -1956,8 +1958,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > - wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) + thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh); + if (thresh < 2 * wb_stat_error()) + reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + else + reclaimable = wb_stat(wb, WB_RECLAIMABLE); + + if (reclaimable > thresh) return true; if (mdtc) { @@ -1971,8 +1978,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > - wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) + thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh); + if (thresh < 2 * wb_stat_error()) + reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + else + reclaimable = wb_stat(wb, WB_RECLAIMABLE); + + if (reclaimable > thresh) return true; } -- cgit From 0323155437870dbbae6e30fb659d7514c9f649da Mon Sep 17 00:00:00 2001 From: Chi Wu Date: Mon, 28 Jun 2021 19:35:34 -0700 Subject: mm/page-writeback: update the comment of Dirty position control As the value of pos_ratio_polynom() clamp between 0 and 2LL << RATELIMIT_CALC_SHIFT, the global control line should be consistent with it. 
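For illustration, here is a small userspace sketch of the clamped cubic that this comment describes: pos_ratio is roughly 1 + x^3 with x = (setpoint - dirty) / (limit - setpoint), clamped to [0, 2], which is why the control line in the diagram now tops out at 2.0. The fixed-point shift and the helper below are illustrative assumptions, not the kernel's exact implementation (see pos_ratio_polynom() in mm/page-writeback.c for that).

/*
 * Userspace sketch of the dirty position control curve: 1 + x^3,
 * clamped to [0, 2.0].  CALC_SHIFT is an assumed fixed-point scale.
 */
#include <stdio.h>

#define CALC_SHIFT 10			/* 1.0 == 1 << CALC_SHIFT */

static long long pos_ratio(long setpoint, long dirty, long limit)
{
	/* x = (setpoint - dirty) / (limit - setpoint), in fixed point */
	long long x = ((long long)(setpoint - dirty) << CALC_SHIFT) /
		      ((limit - setpoint) | 1);
	long long p = x;

	p = p * x >> CALC_SHIFT;	/* x^2 */
	p = p * x >> CALC_SHIFT;	/* x^3 */
	p += 1LL << CALC_SHIFT;		/* 1 + x^3 */

	if (p < 0)			/* clamp to the [0, 2.0] scope */
		p = 0;
	if (p > (2LL << CALC_SHIFT))
		p = 2LL << CALC_SHIFT;
	return p;
}

int main(void)
{
	long limit = 1000, setpoint = 600, dirty;

	for (dirty = 0; dirty <= limit; dirty += 100)
		printf("dirty=%4ld  pos_ratio=%.3f\n", dirty,
		       (double)pos_ratio(setpoint, dirty, limit) /
		       (1 << CALC_SHIFT));
	return 0;
}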
Link: https://lkml.kernel.org/r/20210511103606.3732-1-wuchi.zero@gmail.com Signed-off-by: Chi Wu Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Howard Cochran Cc: Miklos Szeredi Cc: Sedat Dilek Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index aff3205dfde5..d0f090d682f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -845,7 +845,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * ^ pos_ratio * | * | |<===== global dirty control scope ======>| - * 2.0 .............* + * 2.0 * * * * * * * * | .* * | . * * | . * -- cgit From 87e3789749750d83aa085f04f74242087de0154b Mon Sep 17 00:00:00 2001 From: Chi Wu Date: Mon, 28 Jun 2021 19:35:37 -0700 Subject: mm/page-writeback: use __this_cpu_inc() in account_page_dirtied() As account_page_dirtied() was always protected by xa_lock_irqsave(), so using __this_cpu_inc() is better. Link: https://lkml.kernel.org/r/20210512144742.4764-1-wuchi.zero@gmail.com Signed-off-by: Chi Wu Reviewed-by: Jan Kara Cc: Howard Cochran Cc: Miklos Szeredi Cc: Sedat Dilek Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d0f090d682f5..c2a849d653a9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2445,7 +2445,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; - this_cpu_inc(bdp_ratelimits); + __this_cpu_inc(bdp_ratelimits); mem_cgroup_track_foreign_dirty(page, wb); } -- cgit From 4ade5867b4b878b00a4526b8621442f9442536ce Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:41 -0700 Subject: writeback, cgroup: do not switch inodes with I_WILL_FREE flag Patch series "cgroup, blkcg: prevent dirty inodes to pin dying memory cgroups", v9. When an inode is getting dirty for the first time it's associated with a wb structure (see __inode_attach_wb()). It can later be switched to another wb (if e.g. some other cgroup is writing a lot of data to the same inode), but otherwise stays attached to the original wb until being reclaimed. The problem is that the wb structure holds a reference to the original memory and blkcg cgroups. So if an inode has been dirty once and later is actively used in read-only mode, it has a good chance to pin down the original memory and blkcg cgroups forever. This is often the case with services bringing data for other services, e.g. updating some rpm packages. In the real life it becomes a problem due to a large size of the memcg structure, which can easily be 1000x larger than an inode. Also a really large number of dying cgroups can raise different scalability issues, e.g. making the memory reclaim costly and less effective. To solve the problem inodes should be eventually detached from the corresponding writeback structure. It's inefficient to do it after every writeback completion. Instead it can be done whenever the original memory cgroup is offlined and writeback structure is getting killed. Scanning over a (potentially long) list of inodes and detach them from the writeback structure can take quite some time. To avoid scanning all inodes, attached inodes are kept on a new list (b_attached). 
To make it less noticeable to a user, the scanning and switching is performed from a work context. Big thanks to Jan Kara, Dennis Zhou, Hillf Danton and Tejun Heo for their ideas and contribution to this patchset. This patch (of 8): If an inode's state has I_WILL_FREE flag set, the inode will be freed soon, so there is no point in trying to switch the inode to a different cgwb. I_WILL_FREE was ignored since the introduction of the inode switching, so it looks like it doesn't lead to any noticeable issues for a user. This is why the patch is not intended for a stable backport. Link: https://lkml.kernel.org/r/20210608230225.2078447-1-guro@fb.com Link: https://lkml.kernel.org/r/20210608230225.2078447-2-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Acked-by: Tejun Heo Reviewed-by: Jan Kara Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e91980f49388..bd99890599e0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -389,10 +389,10 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) xa_lock_irq(&mapping->i_pages); /* - * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_io_list. + * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction + * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & I_FREEING)) + if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -517,7 +517,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); goto out_free; -- cgit From 592fa002180af3425ba962b8e74edd680f0ec77b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:44 -0700 Subject: writeback, cgroup: add smp_mb() to cgroup_writeback_umount() A full memory barrier is required between clearing SB_ACTIVE flag in generic_shutdown_super() and checking isw_nr_in_flight in cgroup_writeback_umount(), otherwise a new switch operation might be scheduled after atomic_read(&isw_nr_in_flight) returned 0. This would result in a non-flushed isw_wq, and a potential crash. The problem hasn't yet been seen in the real life and was discovered by Jan Kara by looking into the code. Link: https://lkml.kernel.org/r/20210608230225.2078447-3-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dave Chinner Cc: Dennis Zhou Cc: Jan Kara Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bd99890599e0..3564efcc4b78 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1000,6 +1000,12 @@ out_bdi_put: */ void cgroup_writeback_umount(void) { + /* + * SB_ACTIVE should be reliably cleared before checking + * isw_nr_in_flight, see generic_shutdown_super(). 
+ */ + smp_mb(); + if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to -- cgit From 8826ee4fe75051f8cbfa5d4a9aa70565938e724c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:47 -0700 Subject: writeback, cgroup: increment isw_nr_in_flight before grabbing an inode isw_nr_in_flight is used to determine whether the inode switch queue should be flushed from the umount path. Currently it's increased after grabbing an inode and even scheduling the switch work. It means the umount path can walk past cleanup_offline_cgwb() with active inode references, which can result in a "Busy inodes after unmount." message and use-after-free issues (with inode->i_sb which gets freed). Fix it by incrementing isw_nr_in_flight before doing anything with the inode and decrementing in the case when switching wasn't scheduled. The problem hasn't yet been seen in the real life and was discovered by Jan Kara by looking into the code. Link: https://lkml.kernel.org/r/20210608230225.2078447-4-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dave Chinner Cc: Dennis Zhou Cc: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3564efcc4b78..e2cc860a001b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -505,6 +505,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!isw) return; + atomic_inc(&isw_nr_in_flight); + /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); @@ -535,11 +537,10 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - - atomic_inc(&isw_nr_in_flight); return; out_free: + atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); -- cgit From 29264d92a0f157f3147129066d912718b99fc6b0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:50 -0700 Subject: writeback, cgroup: switch to rcu_work API in inode_switch_wbs() Inode's wb switching requires two steps divided by an RCU grace period. It's currently implemented as an RCU callback inode_switch_wbs_rcu_fn(), which schedules inode_switch_wbs_work_fn() as a work. Switching to the rcu_work API allows to do the same in a cleaner and slightly shorter form. 
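A minimal sketch of the pattern, outside of the writeback code: embed a struct rcu_work in the context, INIT_RCU_WORK() it with the handler, and queue_rcu_work() arranges for the handler to run in process context only after an RCU grace period has elapsed. The context structure and function names below are made up for illustration; only the rcu_work API itself is real.

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_ctx {
	void *payload;
	struct rcu_work work;	/* replaces a separate rcu_head + work_struct */
};

static void my_deferred_free(struct work_struct *work)
{
	struct my_ctx *ctx =
		container_of(to_rcu_work(work), struct my_ctx, work);

	/* runs in process context, after an RCU grace period */
	kfree(ctx->payload);
	kfree(ctx);
}

static void my_schedule_free(struct my_ctx *ctx)
{
	INIT_RCU_WORK(&ctx->work, my_deferred_free);
	queue_rcu_work(system_wq, &ctx->work);
}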
Link: https://lkml.kernel.org/r/20210608230225.2078447-5-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e2cc860a001b..96974e13a203 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -335,8 +335,7 @@ struct inode_switch_wbs_context { struct inode *inode; struct bdi_writeback *new_wb; - struct rcu_head rcu_head; - struct work_struct work; + struct rcu_work work; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -352,7 +351,7 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = - container_of(work, struct inode_switch_wbs_context, work); + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; @@ -469,16 +468,6 @@ skip_switch: atomic_dec(&isw_nr_in_flight); } -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) -{ - struct inode_switch_wbs_context *isw = container_of(rcu_head, - struct inode_switch_wbs_context, rcu_head); - - /* needs to grab bh-unsafe locks, bounce to work item */ - INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_work(isw_wq, &isw->work); -} - /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -536,7 +525,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); return; out_free: -- cgit From f3b6a6df38aa514d97e8c6fcc748be1d4142bec9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:53 -0700 Subject: writeback, cgroup: keep list of inodes attached to bdi_writeback Currently there is no way to iterate over inodes attached to a specific cgwb structure. It limits the ability to efficiently reclaim the writeback structure itself and associated memory and block cgroup structures without scanning all inodes belonging to a sb, which can be prohibitively expensive. While dirty/in-active-writeback an inode belongs to one of the bdi_writeback's io lists: b_dirty, b_io, b_more_io and b_dirty_time. Once cleaned up, it's removed from all io lists. So the inode->i_io_list can be reused to maintain the list of inodes, attached to a bdi_writeback structure. This patch introduces a new wb->b_attached list, which contains all inodes which were dirty at least once and are attached to the given cgwb. Inodes attached to the root bdi_writeback structures are never placed on such list. The following patch will use this list to try to release cgwbs structures more efficiently. 
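The key property this relies on is that a list_head is only ever linked into one list at a time: while an inode is dirty its i_io_list hook sits on one of the wb's io lists, and once it is clean that same hook is idle and can park the inode on b_attached instead of being dropped from the wb entirely. A self-contained userspace sketch of that "one hook, several possible lists" idea (with made-up names, not the kernel's list.h):

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_del_entry(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->next = n->prev = n;
}

static void list_add_tail(struct list_head *n, struct list_head *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_move(struct list_head *n, struct list_head *head)
{
	list_del_entry(n);
	list_add_tail(n, head);
}

struct fake_inode {
	int dirty;
	struct list_head io_list;	/* plays the role of inode->i_io_list */
};

int main(void)
{
	struct list_head b_dirty = LIST_HEAD_INIT(b_dirty);
	struct list_head b_attached = LIST_HEAD_INIT(b_attached);
	struct fake_inode ino = { .dirty = 1 };

	list_add_tail(&ino.io_list, &b_dirty);	/* dirtied: on an io list */

	ino.dirty = 0;				/* writeback finished */
	list_move(&ino.io_list, &b_attached);	/* parked, still reachable from the wb */

	printf("clean inode on b_attached: %s\n",
	       b_attached.next == &ino.io_list ? "yes" : "no");
	return 0;
}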
Link: https://lkml.kernel.org/r/20210608230225.2078447-6-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Jan Kara Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 93 +++++++++++++++++++++++++--------------- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 2 + 3 files changed, 62 insertions(+), 34 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 96974e13a203..87b305ee5348 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode, return false; } -/** - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list - * @inode: inode to be removed - * @wb: bdi_writeback @inode is being removed from - * - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and - * clear %WB_has_dirty_io if all are empty afterwards. - */ -static void inode_io_list_del_locked(struct inode *inode, - struct bdi_writeback *wb) -{ - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); - - inode->i_state &= ~I_SYNC_QUEUED; - list_del_init(&inode->i_io_list); - wb_io_lists_depopulated(wb); -} - static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); @@ -278,6 +259,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page) } EXPORT_SYMBOL_GPL(__inode_attach_wb); +/** + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list + * @inode: inode of interest with i_lock held + * @wb: target bdi_writeback + * + * Remove the inode from wb's io lists and if necessarily put onto b_attached + * list. Only inodes attached to cgwb's are kept on this list. + */ +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + if (wb != &wb->bdi->wb) + list_move(&inode->i_io_list, &wb->b_attached); + else + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held @@ -418,21 +421,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_get(new_wb); /* - * Transfer to @new_wb's IO list if necessary. The specific list - * @inode was on is ignored and the inode is put on ->b_dirty which - * is always correct including from ->b_dirty_time. The transfer - * preserves @inode->dirtied_when ordering. + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. + * The transfer preserves @inode->dirtied_when ordering. If the @inode + * was clean, it means it was on the b_attached list, so move it onto + * the b_attached list of @new_wb. 
*/ if (!list_empty(&inode->i_io_list)) { - struct inode *pos; - - inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); + + if (inode->i_state & I_DIRTY_ALL) { + struct inode *pos; + + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_io_list_move_locked(inode, new_wb, + pos->i_io_list.prev); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } } else { inode->i_wb = new_wb; } @@ -1021,6 +1031,17 @@ fs_initcall(cgroup_writeback_init); static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -1121,7 +1142,11 @@ void inode_io_list_del(struct inode *inode) wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode_io_list_del_locked(inode, wb); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1434,7 +1459,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); } } @@ -1586,7 +1611,7 @@ static int writeback_single_inode(struct inode *inode, * responsible for the writeback lists. 
*/ if (!(inode->i_state & I_DIRTY_ALL)) - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index fff9367a6348..e5dc238ebe4f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -154,6 +154,7 @@ struct bdi_writeback { struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + struct list_head b_attached; /* attached inodes, protected by list_lock */ union { struct work_struct release_work; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 576220acd686..54c5dc4b8c24 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -396,6 +396,7 @@ static void cgwb_release_workfn(struct work_struct *work) fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); + WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); } @@ -472,6 +473,7 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; + INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); -- cgit From 72d4512e9cb14d790e361c0e085186a7ef2d2431 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:56 -0700 Subject: writeback, cgroup: split out the functional part of inode_switch_wbs_work_fn() Split out the functional part of the inode_switch_wbs_work_fn() function as inode_do switch_wbs() to reuse it later for switching inodes attached to dying cgwbs. This commit doesn't bring any functional changes. 
Link: https://lkml.kernel.org/r/20210608230225.2078447-7-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 87b305ee5348..5520a6b5cc4d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -351,15 +351,12 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); - struct inode *inode = isw->inode; struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; @@ -470,11 +467,17 @@ skip_switch: wb_wakeup(new_wb); wb_put(old_wb); } - wb_put(new_wb); +} - iput(inode); - kfree(isw); +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + inode_do_switch_wbs(isw->inode, isw->new_wb); + wb_put(isw->new_wb); + iput(isw->inode); + kfree(isw); atomic_dec(&isw_nr_in_flight); } -- cgit From f5fbe6b7ad6ef1fbdf8074a6ca9fdab739bf86d4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:35:59 -0700 Subject: writeback, cgroup: support switching multiple inodes at once Currently only a single inode can be switched to another writeback structure at once. That means to switch an inode a separate inode_switch_wbs_context structure must be allocated, and a separate rcu callback and work must be scheduled. It's fine for the existing ad-hoc switching, which is not happening that often, but sub-optimal for massive switching required in order to release a writeback structure. To prepare for it, let's add a support for switching multiple inodes at once. Instead of containing a single inode pointer, inode_switch_wbs_context will contain a NULL-terminated array of inode pointers. inode_do_switch_wbs() will be called for each inode. To optimize the locking bdi->wb_switch_rwsem, old_wb's and new_wb's list_locks will be acquired and released only once altogether for all inodes. wb_wakeup() will be also be called only once. Instead of calling wb_put(old_wb) after each successful switch, wb_put_many() is introduced and used. 
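Two small idioms carry most of this change: walking a NULL-terminated pointer array (so the second stage knows exactly which inodes went through the first stage) and folding the per-item reference drops into a single wb_put_many(). A userspace sketch of both, with illustrative types and names only:

#include <stdio.h>

struct item { const char *name; };

static unsigned long refcount = 10;

static void ref_put_many(unsigned long nr)
{
	refcount -= nr;		/* one batched drop instead of nr separate ones */
}

int main(void)
{
	struct item a = { "a" }, b = { "b" }, c = { "c" };
	struct item *items[] = { &a, &b, &c, NULL };	/* NULL-terminated, like isw->inodes[] */
	struct item **itemp;
	unsigned long nr_switched = 0;

	for (itemp = items; *itemp; itemp++) {	/* stop at the terminator */
		printf("switching %s\n", (*itemp)->name);
		nr_switched++;
	}

	ref_put_many(nr_switched);		/* cf. wb_put_many(old_wb, nr_switched) */
	printf("refcount now %lu\n", refcount);
	return 0;
}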
Link: https://lkml.kernel.org/r/20210608230225.2078447-8-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Reviewed-by: Jan Kara Acked-by: Dennis Zhou Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 106 +++++++++++++++++++++++---------------- include/linux/backing-dev-defs.h | 18 ++++++- 2 files changed, 80 insertions(+), 44 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5520a6b5cc4d..737ac27adb77 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -335,10 +335,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct inode *inode; - struct bdi_writeback *new_wb; - struct rcu_work work; + + /* + * Multiple inodes can be switched at once. The switching procedure + * consists of two parts, separated by a RCU grace period. To make + * sure that the second part is executed for each inode gone through + * the first part, all inode pointers are placed into a NULL-terminated + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. + */ + struct bdi_writeback *new_wb; + struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -351,39 +359,15 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_do_switch_wbs(struct inode *inode, +static bool inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { - struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; - struct bdi_writeback *old_wb = inode->i_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; - /* - * If @inode switches cgwb membership while sync_inodes_sb() is - * being issued, sync_inodes_sb() might miss it. Synchronize. - */ - down_read(&bdi->wb_switch_rwsem); - - /* - * By the time control reaches here, RCU grace period has passed - * since I_WB_SWITCH assertion and all wb stat update transactions - * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against the i_pages lock. - * - * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock - * gives us exclusion against all wb related operations on @inode - * including IO list manipulations and stat updates. 
- */ - if (old_wb < new_wb) { - spin_lock(&old_wb->list_lock); - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&new_wb->list_lock); - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); - } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); @@ -458,25 +442,63 @@ skip_switch: xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); - spin_unlock(&new_wb->list_lock); - spin_unlock(&old_wb->list_lock); - - up_read(&bdi->wb_switch_rwsem); - if (switched) { - wb_wakeup(new_wb); - wb_put(old_wb); - } + return switched; } static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against the i_pages lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + + for (inodep = isw->inodes; *inodep; inodep++) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; + } + + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + up_read(&bdi->wb_switch_rwsem); + + if (nr_switched) { + wb_wakeup(new_wb); + wb_put_many(old_wb, nr_switched); + } - inode_do_switch_wbs(isw->inode, isw->new_wb); - wb_put(isw->new_wb); - iput(isw->inode); + for (inodep = isw->inodes; *inodep; inodep++) + iput(*inodep); + wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } @@ -503,7 +525,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); if (!isw) return; @@ -530,7 +552,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) __iget(inode); spin_unlock(&inode->i_lock); - isw->inode = inode; + isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e5dc238ebe4f..63f52ad2ce7a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -240,8 +240,9 @@ static inline void wb_get(struct bdi_writeback *wb) /** * wb_put - decrement a wb's refcount * @wb: bdi_writeback to put + * @nr: number of references to put */ -static inline void wb_put(struct bdi_writeback *wb) +static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) { if (WARN_ON_ONCE(!wb->bdi)) { /* @@ -252,7 +253,16 @@ static inline void wb_put(struct bdi_writeback 
*wb) } if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); + percpu_ref_put_many(&wb->refcnt, nr); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + wb_put_many(wb, 1); } /** @@ -281,6 +291,10 @@ static inline void wb_put(struct bdi_writeback *wb) { } +static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr) +{ +} + static inline bool wb_dying(struct bdi_writeback *wb) { return false; -- cgit From c22d70a162d3cc177282c4487be4d54876ca55c8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 28 Jun 2021 19:36:03 -0700 Subject: writeback, cgroup: release dying cgwbs by switching attached inodes Asynchronously try to release dying cgwbs by switching attached inodes to the nearest living ancestor wb. It helps to get rid of per-cgroup writeback structures themselves and of pinned memory and block cgroups, which are significantly larger structures (mostly due to large per-cpu statistics data). This prevents memory waste and helps to avoid different scalability problems caused by large piles of dying cgroups. Reuse the existing mechanism of inode switching used for foreign inode detection. To speed things up batch up to 115 inode switching in a single operation (the maximum number is selected so that the resulting struct inode_switch_wbs_context can fit into 1024 bytes). Because every switching consists of two steps divided by an RCU grace period, it would be too slow without batching. Please note that the whole batch counts as a single operation (when increasing/decreasing isw_nr_in_flight). This allows to keep umounting working (flush the switching queue), however prevents cleanups from consuming the whole switching quota and effectively blocking the frn switching. A cgwb cleanup operation can fail due to different reasons (e.g. not enough memory, the cgwb has an in-flight/pending io, an attached inode in a wrong state, etc). In this case the next scheduled cleanup will make a new attempt. An attempt is made each time a new cgwb is offlined (in other words a memcg and/or a blkcg is deleted by a user). In the future an additional attempt scheduled by a timer can be implemented. [guro@fb.com: replace open-coded "115" with arithmetic] Link: https://lkml.kernel.org/r/YMEcSBcq/VXMiPPO@carbon.dhcp.thefacebook.com [guro@fb.com: add smp_mb() to inode_prepare_wbs_switch()] Link: https://lkml.kernel.org/r/YMFa+guFw7OFjf3X@carbon.dhcp.thefacebook.com [willy@infradead.org: fix documentation] Link: https://lkml.kernel.org/r/20210615200242.1716568-2-willy@infradead.org Link: https://lkml.kernel.org/r/20210608230225.2078447-9-guro@fb.com Signed-off-by: Roman Gushchin Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Tejun Heo Acked-by: Dennis Zhou Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Dave Chinner Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 111 +++++++++++++++++++++++++++++++++++---- include/linux/backing-dev-defs.h | 1 + include/linux/writeback.h | 1 + mm/backing-dev.c | 64 +++++++++++++++++++++- 4 files changed, 165 insertions(+), 12 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 737ac27adb77..62193106683d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -225,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ +/* + * Maximum inodes per isw. 
A specific value has been chosen to make + * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. + */ +#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ + / sizeof(struct inode *)) + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -503,6 +510,32 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) +{ + /* + * Paired with smp_mb() in cgroup_writeback_umount(). + * isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 + * in cgroup_writeback_umount() and the isw_wq will be not flushed. + */ + smp_mb(); + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (!(inode->i_sb->s_flags & SB_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_to_wb(inode) == new_wb) { + spin_unlock(&inode->i_lock); + return false; + } + inode->i_state |= I_WB_SWITCH; + __iget(inode); + spin_unlock(&inode->i_lock); + + return true; +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -540,17 +573,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!isw->new_wb) goto out_free; - /* while holding I_WB_SWITCH, no one else can update the association */ - spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; - } - inode->i_state |= I_WB_SWITCH; - __iget(inode); - spin_unlock(&inode->i_lock); isw->inodes[0] = inode; @@ -571,6 +595,73 @@ out_free: kfree(isw); } +/** + * cleanup_offline_cgwb - detach associated inodes + * @wb: target wb + * + * Switch all inodes attached to @wb to a nearest living ancestor's wb in order + * to eventually release the dying @wb. Returns %true if not all inodes were + * switched and the function has to be restarted. + */ +bool cleanup_offline_cgwb(struct bdi_writeback *wb) +{ + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + struct inode *inode; + int nr; + bool restart = false; + + isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * + sizeof(struct inode *), GFP_KERNEL); + if (!isw) + return restart; + + atomic_inc(&isw_nr_in_flight); + + for (memcg_css = wb->memcg_css->parent; memcg_css; + memcg_css = memcg_css->parent) { + isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (isw->new_wb) + break; + } + if (unlikely(!isw->new_wb)) + isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + + nr = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_attached, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[nr++] = inode; + + if (nr >= WB_MAX_INODES_PER_ISW - 1) { + restart = true; + break; + } + } + spin_unlock(&wb->list_lock); + + /* no attached inodes? bail out */ + if (nr == 0) { + atomic_dec(&isw_nr_in_flight); + wb_put(isw->new_wb); + kfree(isw); + return restart; + } + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. 
+ * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); + + return restart; +} + /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 63f52ad2ce7a..1d7edad9914f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -155,6 +155,7 @@ struct bdi_writeback { struct list_head memcg_node; /* anchored at memcg->cgwb_list */ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ + struct list_head offline_node; /* anchored at offline_cgwbs */ union { struct work_struct release_work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8e5c5bb16e2d..95de51c10248 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -221,6 +221,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); +bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 54c5dc4b8c24..271f2ca862c8 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -371,12 +371,16 @@ static void wb_exit(struct bdi_writeback *wb) #include /* - * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list. - * bdi->cgwb_tree is also RCU protected. + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and + * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; +static LIST_HEAD(offline_cgwbs); +static void cleanup_offline_cgwbs_workfn(struct work_struct *work); +static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); + static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -395,6 +399,11 @@ static void cgwb_release_workfn(struct work_struct *work) fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); + + spin_lock_irq(&cgwb_lock); + list_del(&wb->offline_node); + spin_unlock_irq(&cgwb_lock); + wb_exit(wb); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); @@ -414,6 +423,7 @@ static void cgwb_kill(struct bdi_writeback *wb) WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); + list_add(&wb->offline_node, &offline_cgwbs); percpu_ref_kill(&wb->refcnt); } @@ -635,6 +645,54 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) mutex_unlock(&bdi->cgwb_release_mutex); } +/* + * cleanup_offline_cgwbs_workfn - try to release dying cgwbs + * + * Try to release dying cgwbs by switching attached inodes to the nearest + * living ancestor's writeback. Processed wbs are placed at the end + * of the list to guarantee the forward progress. 
+ */ +static void cleanup_offline_cgwbs_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb; + LIST_HEAD(processed); + + spin_lock_irq(&cgwb_lock); + + while (!list_empty(&offline_cgwbs)) { + wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, + offline_node); + list_move(&wb->offline_node, &processed); + + /* + * If wb is dirty, cleaning up the writeback by switching + * attached inodes will result in an effective removal of any + * bandwidth restrictions, which isn't the goal. Instead, + * it can be postponed until the next time, when all io + * will be likely completed. If in the meantime some inodes + * will get re-dirtied, they should be eventually switched to + * a new cgwb. + */ + if (wb_has_dirty_io(wb)) + continue; + + if (!wb_tryget(wb)) + continue; + + spin_unlock_irq(&cgwb_lock); + while (cleanup_offline_cgwb(wb)) + cond_resched(); + spin_lock_irq(&cgwb_lock); + + wb_put(wb); + } + + if (!list_empty(&processed)) + list_splice_tail(&processed, &offline_cgwbs); + + spin_unlock_irq(&cgwb_lock); +} + /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined @@ -651,6 +709,8 @@ void wb_memcg_offline(struct mem_cgroup *memcg) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); + + queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); } /** -- cgit From 34ebcce793245e64db3b40f24486c59668e1f059 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Jun 2021 19:36:06 -0700 Subject: fs: unexport __set_page_dirty Patch series "remove the implicit .set_page_dirty default". This series cleans up a few lose ends around ->set_page_dirty, most importantly removes the default to the buffer head based on if no method is wired up. This patch (of 3): __set_page_dirty is only used by built-in code. Link: https://lkml.kernel.org/r/20210614061512.3966143-1-hch@lst.de Link: https://lkml.kernel.org/r/20210614061512.3966143-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Matthew Wilcox (Oracle) Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index ea48c01fb76b..3d18831c7ad8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -611,7 +611,6 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, } xa_unlock_irqrestore(&mapping->i_pages, flags); } -EXPORT_SYMBOL_GPL(__set_page_dirty); /* * Add a page to the dirty page list. -- cgit From c1e3dbe9818e3caa4e467255a348df56912ca549 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Jun 2021 19:36:09 -0700 Subject: fs: move ramfs_aops to libfs Move the ramfs aops to libfs and reuse them for kernfs and configfs. Thosw two did not wire up ->set_page_dirty before and now get __set_page_dirty_no_writeback, which is the right one for no-writeback address_space usage. Drop the now unused exports of the libfs helpers only used for ramfs-style pagecache usage. 
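As a usage note (illustration only, not code from this patch): with ram_aops exported from libfs, any other purely in-memory filesystem can point at the shared structure instead of carrying a private copy of the simple aops. The sketch below uses a made-up "examplefs" to show the intended pattern:

#include <linux/fs.h>

/* Hypothetical in-memory filesystem adopting the shared libfs aops. */
static struct inode *examplefs_get_inode(struct super_block *sb, umode_t mode)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	/* data lives only in the page cache, there is no writeback path */
	inode->i_mapping->a_ops = &ram_aops;
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
	return inode;
}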
Link: https://lkml.kernel.org/r/20210614061512.3966143-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Cc: Al Viro Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/inode.c | 8 +------- fs/kernfs/inode.c | 8 +------- fs/libfs.c | 17 +++++++++++++---- fs/ramfs/inode.c | 9 +-------- include/linux/fs.h | 5 +---- 5 files changed, 17 insertions(+), 30 deletions(-) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index eb5ec3e46283..b601610e9907 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -28,12 +28,6 @@ static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; #endif -static const struct address_space_operations configfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, struct inode * inode = new_inode(s); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->a_ops = &ram_aops; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index d73950fc3d57..26f2aa3586f9 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -17,12 +17,6 @@ #include "kernfs-internal.h" -static const struct address_space_operations kernfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, @@ -203,7 +197,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; - inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); diff --git a/fs/libfs.c b/fs/libfs.c index e9b29c6ffccb..2d7f086b93d6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -512,7 +512,7 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } EXPORT_SYMBOL(simple_setattr); -int simple_readpage(struct file *file, struct page *page) +static int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); flush_dcache_page(page); @@ -520,7 +520,6 @@ int simple_readpage(struct file *file, struct page *page) unlock_page(page); return 0; } -EXPORT_SYMBOL(simple_readpage); int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -568,7 +567,7 @@ EXPORT_SYMBOL(simple_write_begin); * * Use *ONLY* with simple_readpage() */ -int simple_write_end(struct file *file, struct address_space *mapping, +static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { @@ -597,7 +596,17 @@ int simple_write_end(struct file *file, struct address_space *mapping, return copied; } -EXPORT_SYMBOL(simple_write_end); + +/* + * Provides ramfs-style behavior: data in the pagecache, but no writeback. 
+ */ +const struct address_space_operations ram_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; +EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 9ebd17d7befb..65e7e56005b8 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -53,13 +53,6 @@ struct ramfs_fs_info { static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; -static const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -68,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&init_user_ns, inode, dir, mode); - inode->i_mapping->a_ops = &ramfs_aops; + inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index c3c88fdb9b2a..869909345420 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3422,13 +3422,10 @@ extern void noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); -extern int simple_readpage(struct file *file, struct page *page); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); -extern int simple_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); +extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, long, struct file_lock **, void **); -- cgit From 0af573780b0b13fceb7fabd49dc1b073cee9a507 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Jun 2021 19:36:12 -0700 Subject: mm: require ->set_page_dirty to be explicitly wired up Remove the CONFIG_BLOCK default to __set_page_dirty_buffers and just wire that method up for the missing instances. 
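For illustration only (the real conversions are in the hunks below): a buffer_head based filesystem now has to name its ->set_page_dirty handler explicitly in the address_space_operations. "examplefs" and its methods are made-up stand-ins, not one of the filesystems touched here:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* definitions elsewhere; only the wiring matters for this sketch */
static int examplefs_readpage(struct file *file, struct page *page);
static int examplefs_writepage(struct page *page, struct writeback_control *wbc);

static const struct address_space_operations examplefs_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,	/* no longer implied */
	.readpage	= examplefs_readpage,
	.writepage	= examplefs_writepage,
};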
[hch@lst.de: ecryptfs: add a ->set_page_dirty cludge] Link: https://lkml.kernel.org/r/20210624125250.536369-1-hch@lst.de Link: https://lkml.kernel.org/r/20210614061512.3966143-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Cc: Al Viro Cc: Matthew Wilcox (Oracle) Cc: Tyler Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/adfs/inode.c | 1 + fs/affs/file.c | 2 ++ fs/bfs/file.c | 1 + fs/block_dev.c | 1 + fs/ecryptfs/mmap.c | 13 +++++++++++++ fs/exfat/inode.c | 1 + fs/ext2/inode.c | 2 ++ fs/fat/inode.c | 1 + fs/gfs2/meta_io.c | 2 ++ fs/hfs/inode.c | 2 ++ fs/hfsplus/inode.c | 2 ++ fs/hpfs/file.c | 1 + fs/jfs/inode.c | 1 + fs/minix/inode.c | 1 + fs/nilfs2/mdt.c | 1 + fs/ocfs2/aops.c | 1 + fs/omfs/file.c | 1 + fs/sysv/itree.c | 1 + fs/udf/file.c | 1 + fs/udf/inode.c | 1 + fs/ufs/inode.c | 1 + mm/page-writeback.c | 18 ++++-------------- 22 files changed, 42 insertions(+), 14 deletions(-) diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index fb7ee026d101..adbb3a1edcbf 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -73,6 +73,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations adfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = adfs_readpage, .writepage = adfs_writepage, .write_begin = adfs_write_begin, diff --git a/fs/affs/file.c b/fs/affs/file.c index d91b0133d95d..75ebd2b576ca 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -453,6 +453,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations affs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, @@ -833,6 +834,7 @@ err_bh: } const struct address_space_operations affs_aops_ofs = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage_ofs, //.writepage = affs_writepage_ofs, .write_begin = affs_write_begin_ofs, diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 0dceefc54b48..7f8544abf636 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -188,6 +188,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations bfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = bfs_readpage, .writepage = bfs_writepage, .write_begin = bfs_write_begin, diff --git a/fs/block_dev.c b/fs/block_dev.c index 6cc4d4cfe0c2..eb34f5c357cf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1754,6 +1754,7 @@ static int blkdev_writepages(struct address_space *mapping, } static const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 392e721b50a3..7d85e64ea62f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -533,7 +533,20 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) return block; } +#include + const struct address_space_operations ecryptfs_aops = { + /* + * XXX: This is pretty broken for multiple reasons: ecryptfs does not + * actually use buffer_heads, and ecryptfs will crash without + * CONFIG_BLOCK. But it matches the behavior before the default for + * address_space_operations without the ->set_page_dirty method was + * cleaned up, so this is the best we can do without maintainer + * feedback. 
+ */ +#ifdef CONFIG_BLOCK + .set_page_dirty = __set_page_dirty_buffers, +#endif .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, .write_begin = ecryptfs_write_begin, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1803ef3220fd..ca37d4344361 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -491,6 +491,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations exfat_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = exfat_readpage, .readahead = exfat_readahead, .writepage = exfat_writepage, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 68178b2234bd..bf41f579ed3e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -961,6 +961,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc } const struct address_space_operations ext2_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_writepage, @@ -975,6 +976,7 @@ const struct address_space_operations ext2_aops = { }; const struct address_space_operations ext2_nobh_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bab9b202b496..de0c9b013a85 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -342,6 +342,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations fat_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = fat_readpage, .readahead = fat_readahead, .writepage = fat_writepage, diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index d68184ebbfdd..7c9619997355 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -89,11 +89,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } const struct address_space_operations gfs2_meta_aops = { + .set_page_dirty = __set_page_dirty_buffers, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; const struct address_space_operations gfs2_rgrp_aops = { + .set_page_dirty = __set_page_dirty_buffers, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3fc5cb346586..4a95a92546a0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -159,6 +159,7 @@ static int hfs_writepages(struct address_space *mapping, } const struct address_space_operations hfs_btree_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, @@ -168,6 +169,7 @@ const struct address_space_operations hfs_btree_aops = { }; const struct address_space_operations hfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 8ea447e5c470..70e8374ddac4 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -156,6 +156,7 @@ static int hfsplus_writepages(struct address_space *mapping, } const struct address_space_operations hfsplus_btree_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, @@ -165,6 +166,7 @@ const struct address_space_operations hfsplus_btree_aops = { }; const struct address_space_operations hfsplus_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage 
= hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 077c25128eb7..c3a49aacf20a 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -196,6 +196,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } const struct address_space_operations hpfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hpfs_readpage, .writepage = hpfs_writepage, .readahead = hpfs_readahead, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f65bfa9f18d..3663dd5a23bc 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -356,6 +356,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations jfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = jfs_readpage, .readahead = jfs_readahead, .writepage = jfs_writepage, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a532a99bbe81..a71f1cf894b9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -442,6 +442,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations minix_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = minix_readpage, .writepage = minix_writepage, .write_begin = minix_write_begin, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c0361ce45f62..97769fe4d588 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -434,6 +434,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) static const struct address_space_operations def_mdt_aops = { + .set_page_dirty = __set_page_dirty_buffers, .writepage = nilfs_mdt_write_page, }; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e1c6fa5bd0e7..68d11c295dd3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2453,6 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations ocfs2_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ocfs2_readpage, .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 11e733aab25d..89725b15a64b 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -372,6 +372,7 @@ const struct inode_operations omfs_file_inops = { }; const struct address_space_operations omfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = omfs_readpage, .readahead = omfs_readahead, .writepage = omfs_writepage, diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 8b2e99b7bc9f..749385015a8d 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -495,6 +495,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations sysv_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = sysv_readpage, .writepage = sysv_writepage, .write_begin = sysv_write_begin, diff --git a/fs/udf/file.c b/fs/udf/file.c index 2846dcd92197..1baff8ddb754 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,6 +125,7 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin } const struct address_space_operations udf_adinicb_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0dd2f93ac048..4917670860a0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -235,6 +235,7 @@ static sector_t udf_bmap(struct address_space *mapping, 
sector_t block) } const struct address_space_operations udf_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = udf_readpage, .readahead = udf_readahead, .writepage = udf_writepage, diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index debc282c1bb4..ac628de69601 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -526,6 +526,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations ufs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c2a849d653a9..21f4b5972311 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -32,7 +32,6 @@ #include #include #include -#include /* __set_page_dirty_buffers */ #include #include #include @@ -2555,13 +2554,9 @@ EXPORT_SYMBOL(redirty_page_for_writepage); /* * Dirty a page. * - * For pages with a mapping this should be done under the page lock - * for the benefit of asynchronous memory errors who prefer a consistent - * dirty state. This rule can be broken in some special cases, - * but should be better not to. - * - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. + * For pages with a mapping this should be done under the page lock for the + * benefit of asynchronous memory errors who prefer a consistent dirty state. + * This rule can be broken in some special cases, but should be better not to. */ int set_page_dirty(struct page *page) { @@ -2569,7 +2564,6 @@ int set_page_dirty(struct page *page) page = compound_head(page); if (likely(mapping)) { - int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; /* * readahead/lru_deactivate_page could remain * PG_readahead/PG_reclaim due to race with end_page_writeback @@ -2582,11 +2576,7 @@ int set_page_dirty(struct page *page) */ if (PageReclaim(page)) ClearPageReclaim(page); -#ifdef CONFIG_BLOCK - if (!spd) - spd = __set_page_dirty_buffers; -#endif - return (*spd)(page); + return mapping->a_ops->set_page_dirty(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) -- cgit From 6e1cae881a0646f31fe2bda90297d820da1137eb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:15 -0700 Subject: mm/writeback: move __set_page_dirty() to core mm Patch series "Further set_page_dirty cleanups". Prompted by Christoph's recent patches, here are some more patches to improve the state of set_page_dirty(). They're all from the folio tree, so they've been tested to a certain extent. This patch (of 6): Nothing in __set_page_dirty() is specific to buffer_head, so move it to mm/page-writeback.c. That removes the only caller of account_page_dirtied() outside of page-writeback.c, so make it static. 
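For context (not new code added by this patch; it condenses what __set_page_dirty_nobuffers ends up doing after the next patch in this series): the helper's contract is unchanged, a ->set_page_dirty implementation calls it with lock_page_memcg() held, roughly like this:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/pagemap.h>

static int example_set_page_dirty(struct page *page)
{
	lock_page_memcg(page);
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);

		if (!mapping) {
			unlock_page_memcg(page);
			return 1;
		}
		/* tag the page dirty in the xarray under the memcg lock */
		__set_page_dirty(page, mapping, !PagePrivate(page));
		unlock_page_memcg(page);

		if (mapping->host)
			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
		return 1;
	}
	unlock_page_memcg(page);
	return 0;
}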
Link: https://lkml.kernel.org/r/20210615162342.1669332-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210615162342.1669332-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Cc: Jan Kara Cc: Al Viro Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 24 ------------------------ include/linux/mm.h | 1 - mm/page-writeback.c | 27 ++++++++++++++++++++++++++- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 3d18831c7ad8..6290c3afdba4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -588,30 +588,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) } EXPORT_SYMBOL(mark_buffer_dirty_inode); -/* - * Mark the page dirty, and set it dirty in the page cache, and mark the inode - * dirty. - * - * If warn is true, then emit a warning if the page is not uptodate and has - * not been truncated. - * - * The caller must hold lock_page_memcg(). - */ -void __set_page_dirty(struct page *page, struct address_space *mapping, - int warn) -{ - unsigned long flags; - - xa_lock_irqsave(&mapping->i_pages, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - } - xa_unlock_irqrestore(&mapping->i_pages, flags); -} - /* * Add a page to the dirty page list. * diff --git a/include/linux/mm.h b/include/linux/mm.h index 9afb8998e7e5..12589b811555 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1855,7 +1855,6 @@ int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); -void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 21f4b5972311..1345882c428b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2425,7 +2425,8 @@ int __set_page_dirty_no_writeback(struct page *page) * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping) +static void account_page_dirtied(struct page *page, + struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2466,6 +2467,30 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, } } +/* + * Mark the page dirty, and set it dirty in the page cache, and mark the inode + * dirty. + * + * If warn is true, then emit a warning if the page is not uptodate and has + * not been truncated. + * + * The caller must hold lock_page_memcg(). + */ +void __set_page_dirty(struct page *page, struct address_space *mapping, + int warn) +{ + unsigned long flags; + + xa_lock_irqsave(&mapping->i_pages, flags); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(warn && !PageUptodate(page)); + account_page_dirtied(page, mapping); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); + } + xa_unlock_irqrestore(&mapping->i_pages, flags); +} + /* * For address_spaces which do not use buffers. Just tag the page as dirty in * the xarray. 
-- cgit From 2f18be363c3332dedaabb9fc050a282a00f4f646 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:18 -0700 Subject: mm/writeback: use __set_page_dirty in __set_page_dirty_nobuffers This is fundamentally the same code, so just call it instead of duplicating it. Link: https://lkml.kernel.org/r/20210615162342.1669332-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Cc: Al Viro Cc: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1345882c428b..8bd69dc5379a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2508,20 +2508,12 @@ int __set_page_dirty_nobuffers(struct page *page) lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); - unsigned long flags; if (!mapping) { unlock_page_memcg(page); return 1; } - - xa_lock_irqsave(&mapping->i_pages, flags); - BUG_ON(page_mapping(page) != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - xa_unlock_irqrestore(&mapping->i_pages, flags); + __set_page_dirty(page, mapping, !PagePrivate(page)); unlock_page_memcg(page); if (mapping->host) { -- cgit From fd7353f88bde80d557b6d74a5351979fc8b1b8db Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:21 -0700 Subject: iomap: use __set_page_dirty_nobuffers The only difference between iomap_set_page_dirty() and __set_page_dirty_nobuffers() is that the latter includes a debugging check that a !Uptodate page has private data. Link: https://lkml.kernel.org/r/20210615162342.1669332-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/gfs2/aops.c | 2 +- fs/iomap/buffered-io.c | 27 +-------------------------- fs/xfs/xfs_aops.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/iomap.h | 1 - 5 files changed, 4 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 23b5be3db044..81d8f064126e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -784,7 +784,7 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .bmap = gfs2_bmap, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 9023717c5188..0065781935c7 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -640,31 +640,6 @@ out_no_page: return status; } -int -iomap_set_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - int newly_dirty; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - /* - * Lock out page's memcg migration to keep PageDirty - * synchronized with per-memcg dirty page counters. 
- */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); - if (newly_dirty) - __set_page_dirty(page, mapping, 0); - unlock_page_memcg(page); - - if (newly_dirty) - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return newly_dirty; -} -EXPORT_SYMBOL_GPL(iomap_set_page_dirty); - static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct page *page) { @@ -684,7 +659,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !PageUptodate(page))) return 0; iomap_set_range_uptodate(page, offset_in_page(pos), len); - iomap_set_page_dirty(page); + __set_page_dirty_nobuffers(page); return copied; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 826caa6b4a5a..a335d79dcff8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -561,7 +561,7 @@ const struct address_space_operations xfs_address_space_operations = { .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index cd145d318b17..3aacf016c7c2 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -185,7 +185,7 @@ static const struct address_space_operations zonefs_file_aops = { .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .migratepage = iomap_migrate_page, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index c87d0cb0de6d..479c1da3e221 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -159,7 +159,6 @@ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); -int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); int iomap_releasepage(struct page *page, gfp_t gfp_mask); -- cgit From fc50eee3291556d623b64bb4b1dc345b971e184e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:24 -0700 Subject: fs: remove anon_set_page_dirty() Use __set_page_dirty_no_writeback() instead. This will set the dirty bit on the page, which will be used to avoid calling set_page_dirty() in the future. It will have no effect on actually writing the page back, as the pages are not on any LRU lists. Link: https://lkml.kernel.org/r/20210615162342.1669332-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 2d7f086b93d6..3fdd89b156d6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1217,19 +1217,10 @@ void kfree_link(void *p) } EXPORT_SYMBOL(kfree_link); -/* - * nop .set_page_dirty method so that people can use .page_mkwrite on - * anon inodes. 
- */ -static int anon_set_page_dirty(struct page *page) -{ - return 0; -}; - struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { - .set_page_dirty = anon_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, }; struct inode *inode = new_inode_pseudo(s); -- cgit From b82a96c9253333a8834b2df5f262a39cccf4f6c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:27 -0700 Subject: fs: remove noop_set_page_dirty() Use __set_page_dirty_no_writeback() instead. This will set the dirty bit on the page, which will be used to avoid calling set_page_dirty() in the future. It will have no effect on actually writing the page back, as the pages are not on any LRU lists. [akpm@linux-foundation.org: export __set_page_dirty_no_writeback() to modules] Link: https://lkml.kernel.org/r/20210615162342.1669332-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 2 +- fs/ext2/inode.c | 2 +- fs/ext4/inode.c | 2 +- fs/fuse/dax.c | 2 +- fs/libfs.c | 16 ---------------- fs/xfs/xfs_aops.c | 2 +- include/linux/fs.h | 1 - mm/page-writeback.c | 1 + 8 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index db92573c94e8..dd8222a42808 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -337,7 +337,7 @@ static unsigned long dax_get_unmapped_area(struct file *filp, } static const struct address_space_operations dev_dax_aops = { - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index bf41f579ed3e..dadb121beb22 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -992,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = { static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fe6045a46599..b8170a008590 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, .invalidatepage = noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ff99ab2a3c43..515ad0895345 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1329,7 +1329,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) static const struct address_space_operations fuse_dax_file_aops = { .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/libfs.c b/fs/libfs.c index 3fdd89b156d6..51b4de3b3447 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1171,22 +1171,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -int 
noop_set_page_dirty(struct page *page) -{ - /* - * Unlike __set_page_dirty_no_writeback that handles dirty page - * tracking in the page object, dax does all dirty tracking in - * the inode address_space in response to mkwrite faults. In the - * dax case we only need to worry about potentially dirty CPU - * caches, not dirty page cache pages to write back. - * - * This callback is defined to prevent fallback to - * __set_page_dirty_buffers() in set_page_dirty(). - */ - return 0; -} -EXPORT_SYMBOL_GPL(noop_set_page_dirty); - void noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a335d79dcff8..cb4e0fcf4c76 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -575,7 +575,7 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 869909345420..fad6663cd1b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3417,7 +3417,6 @@ extern int simple_rename(struct user_namespace *, struct inode *, extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -extern int noop_set_page_dirty(struct page *page); extern void noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8bd69dc5379a..e5b38ffe9fca 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2417,6 +2417,7 @@ int __set_page_dirty_no_writeback(struct page *page) return !TestSetPageDirty(page); return 0; } +EXPORT_SYMBOL(__set_page_dirty_no_writeback); /* * Helper function for set_page_dirty family. -- cgit From 3a6b2162005f24c7caa10d7f10dba487629787f2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:36:30 -0700 Subject: mm: move page dirtying prototypes from mm.h These functions implement the address_space ->set_page_dirty operation and should live in pagemap.h, not mm.h so that the rest of the kernel doesn't get funny ideas about calling them directly. Link: https://lkml.kernel.org/r/20210615162342.1669332-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dax.c | 1 + fs/zonefs/super.c | 2 +- include/linux/mm.h | 3 --- include/linux/pagemap.h | 4 ++++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 515ad0895345..fb733eb5aead 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3aacf016c7c2..dbf03635869c 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -5,7 +5,7 @@ * Copyright (C) 2019 Western Digital Corporation or its affiliates. 
*/ #include -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 12589b811555..e39ed497578b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1850,9 +1850,6 @@ extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); -void __set_page_dirty(struct page *, struct address_space *, int warn); -int __set_page_dirty_nobuffers(struct page *page); -int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_cleaned(struct page *page, struct address_space *mapping, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0f1b34dbf3a2..ed02aa522263 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -702,6 +702,10 @@ int wait_on_page_writeback_killable(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); +void __set_page_dirty(struct page *, struct address_space *, int warn); +int __set_page_dirty_nobuffers(struct page *page); +int __set_page_dirty_no_writeback(struct page *page); + void page_endio(struct page *page, bool is_write, int err); /** -- cgit From f39bd8534594535f6fd968ee7e05d6a70b74d1a9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 28 Jun 2021 19:36:33 -0700 Subject: mm/gup_benchmark: support threading Patch series "mm/gup: Fix pin page write cache bouncing on has_pinned", v2. This series contains 3 patches, the 1st one enables threading for gup_benchmark in the kselftest. The latter two patches are collected from Andrea's local branch which can fix write cache bouncing issue with pinning fast-gup. To be explicit on the latter two patches: - the 2nd patch fixes the perf degrade when introducing has_pinned, then - the last patch tries to remove the has_pinned with a bit in mm->flags For patch 3: originally I think we had a plan to reuse has_pinned into a counter very soon, however that's not happening at least until today, so maybe it proves that we can remove it until we really want such a counter for whatever reason. As the commit message stated, it saves 4 bytes for each mm without observable regressions. Regarding testing: we can reference to the commit message of patch 2 for some detailed testing with will-is-scale. Meanwhile I did patch 1 just because then we can even easily verify the patchset using the existing kselftest facilities or even regress test it in the future with the repo if we want. Below numbers are extra verification tests that I did besides commit message of patch 2 using the new gup_benchmark and 256 cpus. Below test is done on 40 cpus host with Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz, and I can get similar result (of course the write cache bouncing get severe with even more cores). 
After patch 1 applied (only test patch, so using old kernel): $ sudo chrt -f 1 ./gup_test -a -m 512 -j 40 PIN_FAST_BENCHMARK: Time: get:459632 put:5990 us PIN_FAST_BENCHMARK: Time: get:461967 put:5840 us PIN_FAST_BENCHMARK: Time: get:464521 put:6140 us PIN_FAST_BENCHMARK: Time: get:465176 put:7100 us PIN_FAST_BENCHMARK: Time: get:465960 put:6733 us PIN_FAST_BENCHMARK: Time: get:465324 put:6781 us PIN_FAST_BENCHMARK: Time: get:466018 put:7130 us PIN_FAST_BENCHMARK: Time: get:466362 put:7118 us PIN_FAST_BENCHMARK: Time: get:465118 put:6975 us PIN_FAST_BENCHMARK: Time: get:466422 put:6602 us PIN_FAST_BENCHMARK: Time: get:465791 put:6818 us PIN_FAST_BENCHMARK: Time: get:467091 put:6298 us PIN_FAST_BENCHMARK: Time: get:467694 put:5432 us PIN_FAST_BENCHMARK: Time: get:469575 put:5581 us PIN_FAST_BENCHMARK: Time: get:468124 put:6055 us PIN_FAST_BENCHMARK: Time: get:468877 put:6720 us PIN_FAST_BENCHMARK: Time: get:467212 put:4961 us PIN_FAST_BENCHMARK: Time: get:467834 put:6697 us PIN_FAST_BENCHMARK: Time: get:470778 put:6398 us PIN_FAST_BENCHMARK: Time: get:469788 put:6310 us PIN_FAST_BENCHMARK: Time: get:488277 put:7113 us PIN_FAST_BENCHMARK: Time: get:486613 put:7085 us PIN_FAST_BENCHMARK: Time: get:486940 put:7202 us PIN_FAST_BENCHMARK: Time: get:488728 put:7101 us PIN_FAST_BENCHMARK: Time: get:487570 put:7327 us PIN_FAST_BENCHMARK: Time: get:489260 put:7027 us PIN_FAST_BENCHMARK: Time: get:488846 put:6866 us PIN_FAST_BENCHMARK: Time: get:488521 put:6745 us PIN_FAST_BENCHMARK: Time: get:489950 put:6459 us PIN_FAST_BENCHMARK: Time: get:489777 put:6617 us PIN_FAST_BENCHMARK: Time: get:488224 put:6591 us PIN_FAST_BENCHMARK: Time: get:488644 put:6477 us PIN_FAST_BENCHMARK: Time: get:488754 put:6711 us PIN_FAST_BENCHMARK: Time: get:488875 put:6743 us PIN_FAST_BENCHMARK: Time: get:489290 put:6657 us PIN_FAST_BENCHMARK: Time: get:490264 put:6684 us PIN_FAST_BENCHMARK: Time: get:489631 put:6737 us PIN_FAST_BENCHMARK: Time: get:488434 put:6655 us PIN_FAST_BENCHMARK: Time: get:492213 put:6297 us PIN_FAST_BENCHMARK: Time: get:491124 put:6173 us After the whole series applied (new fixed kernel): $ sudo chrt -f 1 ./gup_test -a -m 512 -j 40 PIN_FAST_BENCHMARK: Time: get:82038 put:7041 us PIN_FAST_BENCHMARK: Time: get:82144 put:6817 us PIN_FAST_BENCHMARK: Time: get:83417 put:6674 us PIN_FAST_BENCHMARK: Time: get:82540 put:6594 us PIN_FAST_BENCHMARK: Time: get:83214 put:6681 us PIN_FAST_BENCHMARK: Time: get:83444 put:6889 us PIN_FAST_BENCHMARK: Time: get:83194 put:7499 us PIN_FAST_BENCHMARK: Time: get:84876 put:7369 us PIN_FAST_BENCHMARK: Time: get:86092 put:10289 us PIN_FAST_BENCHMARK: Time: get:86153 put:10415 us PIN_FAST_BENCHMARK: Time: get:85026 put:7751 us PIN_FAST_BENCHMARK: Time: get:85458 put:7944 us PIN_FAST_BENCHMARK: Time: get:85735 put:8154 us PIN_FAST_BENCHMARK: Time: get:85851 put:8299 us PIN_FAST_BENCHMARK: Time: get:86323 put:9617 us PIN_FAST_BENCHMARK: Time: get:86288 put:10496 us PIN_FAST_BENCHMARK: Time: get:87697 put:9346 us PIN_FAST_BENCHMARK: Time: get:87980 put:8382 us PIN_FAST_BENCHMARK: Time: get:88719 put:8400 us PIN_FAST_BENCHMARK: Time: get:87616 put:8588 us PIN_FAST_BENCHMARK: Time: get:86730 put:9563 us PIN_FAST_BENCHMARK: Time: get:88167 put:8673 us PIN_FAST_BENCHMARK: Time: get:86844 put:9777 us PIN_FAST_BENCHMARK: Time: get:88068 put:11774 us PIN_FAST_BENCHMARK: Time: get:86170 put:15676 us PIN_FAST_BENCHMARK: Time: get:87967 put:12827 us PIN_FAST_BENCHMARK: Time: get:95773 put:7652 us PIN_FAST_BENCHMARK: Time: get:87734 put:13650 us PIN_FAST_BENCHMARK: Time: get:89833 
put:14237 us PIN_FAST_BENCHMARK: Time: get:96186 put:8029 us PIN_FAST_BENCHMARK: Time: get:95532 put:8886 us PIN_FAST_BENCHMARK: Time: get:95351 put:5826 us PIN_FAST_BENCHMARK: Time: get:96401 put:8407 us PIN_FAST_BENCHMARK: Time: get:96473 put:8287 us PIN_FAST_BENCHMARK: Time: get:97177 put:8430 us PIN_FAST_BENCHMARK: Time: get:98120 put:5263 us PIN_FAST_BENCHMARK: Time: get:96271 put:7757 us PIN_FAST_BENCHMARK: Time: get:99628 put:10467 us PIN_FAST_BENCHMARK: Time: get:99344 put:10045 us PIN_FAST_BENCHMARK: Time: get:94212 put:15485 us Summary: Old kernel: 477729.97 (+-3.79%) New kernel: 89144.65 (+-11.76%) This patch (of 3): Add a new parameter "-j N" to support concurrent gup test. Link: https://lkml.kernel.org/r/20210507150553.208763-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210507150553.208763-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: John Hubbard Cc: Jan Kara Cc: Michal Hocko Cc: Kirill Tkhai Cc: Kirill Shutemov Cc: Oleg Nesterov Cc: Jann Horn Cc: Andrea Arcangeli Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/gup_test.c | 96 ++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 1e662d59c502..fe043f67798b 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include "../../../../mm/gup_test.h" #define MB (1UL << 20) @@ -15,6 +17,12 @@ #define FOLL_WRITE 0x01 /* check pte is writable */ #define FOLL_TOUCH 0x02 /* mark page accessed */ +static unsigned long cmd = GUP_FAST_BENCHMARK; +static int gup_fd, repeats = 1; +static unsigned long size = 128 * MB; +/* Serialize prints */ +static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; + static char *cmd_to_str(unsigned long cmd) { switch (cmd) { @@ -34,17 +42,55 @@ static char *cmd_to_str(unsigned long cmd) return "Unknown command"; } +void *gup_thread(void *data) +{ + struct gup_test gup = *(struct gup_test *)data; + int i; + + /* Only report timing information on the *_BENCHMARK commands: */ + if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || + (cmd == PIN_LONGTERM_BENCHMARK)) { + for (i = 0; i < repeats; i++) { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) + perror("ioctl"), exit(1); + + pthread_mutex_lock(&print_mutex); + printf("%s: Time: get:%lld put:%lld us", + cmd_to_str(cmd), gup.get_delta_usec, + gup.put_delta_usec); + if (gup.size != size) + printf(", truncated (size: %lld)", gup.size); + printf("\n"); + pthread_mutex_unlock(&print_mutex); + } + } else { + gup.size = size; + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl"); + exit(1); + } + + pthread_mutex_lock(&print_mutex); + printf("%s: done\n", cmd_to_str(cmd)); + if (gup.size != size) + printf("Truncated (size: %lld)\n", gup.size); + pthread_mutex_unlock(&print_mutex); + } + + return NULL; +} + int main(int argc, char **argv) { struct gup_test gup = { 0 }; - unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; - unsigned long cmd = GUP_FAST_BENCHMARK; + int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; + pthread_t *tid; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) { + while ((opt = getopt(argc, argv, 
"m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -74,6 +120,9 @@ int main(int argc, char **argv) /* strtol, so you can pass flags in hex form */ gup.gup_flags = strtol(optarg, 0, 0); break; + case 'j': + nthreads = atoi(optarg); + break; case 'm': size = atoi(optarg) * MB; break; @@ -154,8 +203,8 @@ int main(int argc, char **argv) if (write) gup.gup_flags |= FOLL_WRITE; - fd = open("/sys/kernel/debug/gup_test", O_RDWR); - if (fd == -1) { + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) { perror("open"); exit(1); } @@ -185,32 +234,17 @@ int main(int argc, char **argv) p[0] = 0; } - /* Only report timing information on the *_BENCHMARK commands: */ - if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || - (cmd == PIN_LONGTERM_BENCHMARK)) { - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(fd, cmd, &gup)) - perror("ioctl"), exit(1); - - printf("%s: Time: get:%lld put:%lld us", - cmd_to_str(cmd), gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - } - } else { - gup.size = size; - if (ioctl(fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - printf("%s: done\n", cmd_to_str(cmd)); - if (gup.size != size) - printf("Truncated (size: %lld)\n", gup.size); + tid = malloc(sizeof(pthread_t) * nthreads); + assert(tid); + for (i = 0; i < nthreads; i++) { + ret = pthread_create(&tid[i], NULL, gup_thread, &gup); + assert(ret == 0); + } + for (i = 0; i < nthreads; i++) { + ret = pthread_join(tid[i], NULL); + assert(ret == 0); } + free(tid); return 0; } -- cgit From 292648ac5cf16ec1fce33e29e0f9e35da7de63f7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jun 2021 19:36:36 -0700 Subject: mm: gup: allow FOLL_PIN to scale in SMP has_pinned cannot be written by each pin-fast or it won't scale in SMP. This isn't "false sharing" strictly speaking (it's more like "true non-sharing"), but it creates the same SMP scalability bottleneck of "false sharing". To verify the improvement, below test is done on 40 cpus host with Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz (must be with CONFIG_GUP_TEST=y): $ sudo chrt -f 1 ./gup_test -a -m 512 -j 40 Where we can get (average value for 40 threads): Old kernel: 477729.97 (+- 3.79%) New kernel: 89144.65 (+-11.76%) On a similar condition with 256 cpus, this commits increases the SMP scalability of pin_user_pages_fast() executed by different threads of the same process by more than 4000%. 
[peterx@redhat.com: rewrite commit message, add parentheses against "(A & B)"] Link: https://lkml.kernel.org/r/20210507150553.208763-3-peterx@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Reviewed-by: John Hubbard Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 90262e448552..a6c20a7b3c49 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1320,7 +1320,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } - if (flags & FOLL_PIN) + if ((flags & FOLL_PIN) && !atomic_read(&mm->has_pinned)) atomic_set(&mm->has_pinned, 1); /* @@ -2641,7 +2641,7 @@ static int internal_get_user_pages_fast(unsigned long start, FOLL_FAST_ONLY))) return -EINVAL; - if (gup_flags & FOLL_PIN) + if ((gup_flags & FOLL_PIN) && !atomic_read(¤t->mm->has_pinned)) atomic_set(¤t->mm->has_pinned, 1); if (!(gup_flags & FOLL_FAST_ONLY)) -- cgit From a458b76a4171f893efa7657dc079924580a8746a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jun 2021 19:36:40 -0700 Subject: mm: gup: pack has_pinned in MMF_HAS_PINNED has_pinned 32bit can be packed in the MMF_HAS_PINNED bit as a noop cleanup. Any atomic_inc/dec to the mm cacheline shared by all threads in pin-fast would reintroduce a loss of SMP scalability to pin-fast, so there's no future potential usefulness to keep an atomic in the mm for this. set_bit(MMF_HAS_PINNED) will be theoretically a bit slower than WRITE_ONCE (atomic_set is equivalent to WRITE_ONCE), but the set_bit (just like atomic_set after this commit) has to be still issued only once per "mm", so the difference between the two will be lost in the noise. will-it-scale "mmap2" shows no change in performance with enterprise config as expected. will-it-scale "pin_fast" retains the > 4000% SMP scalability performance improvement against upstream as expected. This is a noop as far as overall performance and SMP scalability are concerned. 
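To see the same effect outside the kernel, here is a stand-alone user-space sketch (hypothetical demo.c, not part of this series) of the write-once-then-read-only idiom that both this patch and the previous one preserve: built with -DNAIVE every thread stores unconditionally and the shared cache line bounces between CPUs, while the default build only reads once the flag is set.

/* cc -O2 -pthread demo.c [-DNAIVE] && time ./a.out */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 8
#define ITERS (1UL << 24)

static atomic_int has_pinned;		/* stands in for the per-mm flag */

static void *worker(void *arg)
{
	unsigned long i;

	(void)arg;
	for (i = 0; i < ITERS; i++) {
#ifdef NAIVE
		/* every iteration dirties the shared cache line */
		atomic_store_explicit(&has_pinned, 1, memory_order_relaxed);
#else
		/* read-only once set; at most one store ever happens */
		if (!atomic_load_explicit(&has_pinned, memory_order_relaxed))
			atomic_store_explicit(&has_pinned, 1, memory_order_relaxed);
#endif
	}
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	printf("flag = %d\n", atomic_load(&has_pinned));
	return 0;
}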
[peterx@redhat.com: pack has_pinned in MMF_HAS_PINNED] Link: https://lkml.kernel.org/r/YJqWESqyxa8OZA+2@t490s [akpm@linux-foundation.org: coding style fixes] [peterx@redhat.com: fix build for task_mmu.c, introduce mm_set_has_pinned_flag, fix comments] Link: https://lkml.kernel.org/r/20210507150553.208763-4-peterx@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Reviewed-by: John Hubbard Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Matthew Wilcox Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 2 +- include/linux/mm_types.h | 10 ---------- include/linux/sched/coredump.h | 8 ++++++++ kernel/fork.c | 1 - mm/gup.c | 19 +++++++++++++++---- 6 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc9784544b24..66965ad88d8b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1047,7 +1047,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return false; if (!is_cow_mapping(vma->vm_flags)) return false; - if (likely(!atomic_read(&vma->vm_mm->has_pinned))) + if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; page = vm_normal_page(vma, addr, pte); if (!page) diff --git a/include/linux/mm.h b/include/linux/mm.h index e39ed497578b..79f32962d7ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1341,7 +1341,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, if (!is_cow_mapping(vma->vm_flags)) return false; - if (!atomic_read(&vma->vm_mm->has_pinned)) + if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return page_maybe_dma_pinned(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f0fb62e8975..b66d0225414e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -435,16 +435,6 @@ struct mm_struct { */ atomic_t mm_count; - /** - * @has_pinned: Whether this mm has pinned any pages. This can - * be either replaced in the future by @pinned_vm when it - * becomes stable, or grow into a counter on its own. We're - * aggresive on this bit now - even if the pinned pages were - * unpinned later on, we'll still keep this bit set for the - * lifecycle of this mm just for simplicity. - */ - atomic_t has_pinned; - #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index dfd82eab2902..4d9e3a656875 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -73,6 +73,14 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +/* + * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either + * replaced in the future by mm.pinned_vm when it becomes stable, or grow into + * a counter on its own. We're aggresive on this bit for now: even if the + * pinned pages were unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm, just for simplicity. 
+ */ +#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/kernel/fork.c b/kernel/fork.c index a070caed5c8e..c6747d556ef9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1029,7 +1029,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); diff --git a/mm/gup.c b/mm/gup.c index a6c20a7b3c49..8651309f8ec3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -420,6 +420,17 @@ void unpin_user_pages(struct page **pages, unsigned long npages) } EXPORT_SYMBOL(unpin_user_pages); +/* + * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's + * lifecycle. Avoid setting the bit unless necessary, or it might cause write + * cache bouncing on large SMP machines for concurrent pinned gups. + */ +static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +{ + if (!test_bit(MMF_HAS_PINNED, mm_flags)) + set_bit(MMF_HAS_PINNED, mm_flags); +} + #ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) @@ -1320,8 +1331,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } - if ((flags & FOLL_PIN) && !atomic_read(&mm->has_pinned)) - atomic_set(&mm->has_pinned, 1); + if (flags & FOLL_PIN) + mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior @@ -2641,8 +2652,8 @@ static int internal_get_user_pages_fast(unsigned long start, FOLL_FAST_ONLY))) return -EINVAL; - if ((gup_flags & FOLL_PIN) && !atomic_read(&current->mm->has_pinned)) - atomic_set(&current->mm->has_pinned, 1); + if (gup_flags & FOLL_PIN) + mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); -- cgit From e17eae2b839937817d771e2f5d2b30e5e2b81bb7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Jun 2021 19:36:43 -0700 Subject: mm: pagewalk: fix walk for hugepage tables Pagewalk ignores hugepd entries and walks down the tables as if they were traditional entries, leading to bogus results. Add walk_hugepd_range() and use it to walk hugepage tables.
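The walk itself is just a loop that steps through the range in units of the huge page size and skips a range whose start is not aligned to that size. Below is a small userspace sketch of that loop structure, with an invented callback name and assuming the range length is a multiple of the huge page size; the real implementation is walk_hugepd_range() in the diff that follows:

  #include <stdio.h>

  typedef int (*pte_entry_fn)(unsigned long addr, unsigned long next);

  static int walk_huge_range(unsigned long addr, unsigned long end,
                             unsigned int shift, pte_entry_fn cb)
  {
          unsigned long page_size = 1UL << shift;
          int err = 0;

          if (addr & (page_size - 1))     /* unaligned start: nothing to visit */
                  return 0;

          for (;;) {
                  err = cb(addr, addr + page_size);
                  if (err)
                          break;
                  if (addr >= end - page_size)    /* last huge page in range */
                          break;
                  addr += page_size;
          }
          return err;
  }

  static int print_entry(unsigned long addr, unsigned long next)
  {
          printf("entry covers %#lx-%#lx\n", addr, next);
          return 0;
  }

  int main(void)
  {
          /* walk a 64MB range with 16MB "huge pages" (shift 24) */
          return walk_huge_range(0x4000000, 0x8000000, 24, print_entry);
  }

The kernel version additionally takes the page_table_lock around each callback and derives the shift from the hugepd entry itself.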
Link: https://lkml.kernel.org/r/38d04410700c8d02f28ba37e020b62c55d6f3d2c.1624597695.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Steven Price Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Daniel Axtens Cc: "Oliver O'Halloran" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index e81640d9f177..9b3db11a4d1d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_ARCH_HAS_HUGEPD +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, + unsigned long end, struct mm_walk *walk, int pdshift) +{ + int err = 0; + const struct mm_walk_ops *ops = walk->ops; + int shift = hugepd_shift(*phpd); + int page_size = 1 << shift; + + if (!ops->pte_entry) + return 0; + + if (addr & (page_size - 1)) + return 0; + + for (;;) { + pte_t *pte; + + spin_lock(&walk->mm->page_table_lock); + pte = hugepte_offset(*phpd, addr, pdshift); + err = ops->pte_entry(pte, addr, addr + page_size, walk); + spin_unlock(&walk->mm->page_table_lock); + + if (err) + break; + if (addr >= end - page_size) + break; + addr += page_size; + } + return err; +} +#else +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, + unsigned long end, struct mm_walk *walk, int pdshift) +{ + return 0; +} +#endif + static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -108,7 +147,10 @@ again: goto again; } - err = walk_pte_range(pmd, addr, next, walk); + if (is_hugepd(__hugepd(pmd_val(*pmd)))) + err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT); + else + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); @@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pud)) goto again; - err = walk_pmd_range(pud, addr, next, walk); + if (is_hugepd(__hugepd(pud_val(*pud)))) + err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT); + else + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, if (err) break; } - if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) + if (is_hugepd(__hugepd(p4d_val(*p4d)))) + err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT); + else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, if (err) break; } - if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || - ops->pte_entry) + if (is_hugepd(__hugepd(pgd_val(*pgd)))) + err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT); + else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; -- cgit From 63d8620ecf93b5d8d0a254471184d08f8e8f538d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:36:46 -0700 Subject: mm/swapfile: use percpu_ref to serialize against concurrent swapoff Patch series "close various race windows for swap", v6. 
When I was investigating the swap code, I found some possible race windows. This series aims to fix all these races. But using current get/put_swap_device() to guard against concurrent swapoff for swap_readpage() looks terrible because swap_readpage() may take really long time. And to reduce the performance overhead on the hot-path as much as possible, it appears we can use the percpu_ref to close this race window(as suggested by Huang, Ying). The patch 1 adds percpu_ref support for swap and most of the remaining patches try to use this to close various race windows. More details can be found in the respective changelogs. This patch (of 4): Using current get/put_swap_device() to guard against concurrent swapoff for some swap ops, e.g. swap_readpage(), looks terrible because they might take really long time. This patch adds the percpu_ref support to serialize against concurrent swapoff(as suggested by Huang, Ying). Also we remove the SWP_VALID flag because it's used together with RCU solution. Link: https://lkml.kernel.org/r/20210426123316.806267-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210426123316.806267-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: "Huang, Ying" Cc: Alex Shi Cc: David Hildenbrand Cc: Dennis Zhou Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Wei Yang Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 ++-- mm/swapfile.c | 79 ++++++++++++++++++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 144727041e78..c9e7fea10b83 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -177,7 +177,6 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ - SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... */ SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; @@ -240,6 +239,7 @@ struct swap_cluster_list { * The in-memory structure used to track swap areas. */ struct swap_info_struct { + struct percpu_ref users; /* indicate and keep swap device valid. 
*/ unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ @@ -260,6 +260,7 @@ struct swap_info_struct { struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ + struct completion comp; /* seldom referenced */ #ifdef CONFIG_FRONTSWAP unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ atomic_t frontswap_pages; /* frontswap pages in-use counter */ @@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page); static inline void put_swap_device(struct swap_info_struct *si) { - rcu_read_unlock(); + percpu_ref_put(&si->users); } #else /* CONFIG_SWAP */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 996afa8131c8..a9a04a5360d9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void swap_users_ref_free(struct percpu_ref *ref) +{ + struct swap_info_struct *si; + + si = container_of(ref, struct swap_info_struct, users); + complete(&si->comp); +} + static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) { struct swap_cluster_info *ci = si->cluster_info; @@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * via preventing the swap device from being swapoff, until * put_swap_device() is called. Otherwise return NULL. * - * The entirety of the RCU read critical section must come before the - * return from or after the call to synchronize_rcu() in - * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is - * true, the si->map, si->cluster_info, etc. must be valid in the - * critical section. - * * Notice that swapoff or swapoff+swapon can still happen before the - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() - * in put_swap_device() if there isn't any other way to prevent - * swapoff, such as page lock, page table lock, etc. The caller must - * be prepared for that. For example, the following situation is - * possible. + * percpu_ref_tryget_live() in get_swap_device() or after the + * percpu_ref_put() in put_swap_device() if there isn't any other way + * to prevent swapoff, such as page lock, page table lock, etc. The + * caller must be prepared for that. For example, the following + * situation is possible. * * CPU1 CPU2 * do_swap_page() @@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) si = swp_swap_info(entry); if (!si) goto bad_nofile; - - rcu_read_lock(); - if (data_race(!(si->flags & SWP_VALID))) - goto unlock_out; + if (!percpu_ref_tryget_live(&si->users)) + goto out; + /* + * Guarantee the si->users are checked before accessing other + * fields of swap_info_struct. + * + * Paired with the spin_unlock() after setup_swap_info() in + * enable_swap_info(). 
+ */ + smp_rmb(); offset = swp_offset(entry); if (offset >= si->max) - goto unlock_out; + goto put_out; return si; bad_nofile: pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; -unlock_out: - rcu_read_unlock(); +put_out: + percpu_ref_put(&si->users); return NULL; } @@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio, static void _enable_swap_info(struct swap_info_struct *p) { - p->flags |= SWP_WRITEOK | SWP_VALID; + p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, spin_unlock(&p->lock); spin_unlock(&swap_lock); /* - * Guarantee swap_map, cluster_info, etc. fields are valid - * between get/put_swap_device() if SWP_VALID bit is set + * Finished initializing swap device, now it's safe to reference it. */ - synchronize_rcu(); + percpu_ref_resurrect(&p->users); spin_lock(&swap_lock); spin_lock(&p->lock); _enable_swap_info(p); @@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); - spin_lock(&swap_lock); - spin_lock(&p->lock); - p->flags &= ~SWP_VALID; /* mark swap device as invalid */ - spin_unlock(&p->lock); - spin_unlock(&swap_lock); /* - * wait for swap operations protected by get/put_swap_device() - * to complete + * Wait for swap operations protected by get/put_swap_device() + * to complete. + * + * We need synchronize_rcu() here to protect the accessing to + * the swap cache data structure. */ + percpu_ref_kill(&p->users); synchronize_rcu(); + wait_for_completion(&p->comp); flush_work(&p->discard_work); @@ -2857,6 +2865,12 @@ static struct swap_info_struct *alloc_swap_info(void) if (!p) return ERR_PTR(-ENOMEM); + if (percpu_ref_init(&p->users, swap_users_ref_free, + PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { + kvfree(p); + return ERR_PTR(-ENOMEM); + } + spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) @@ -2864,6 +2878,7 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); + percpu_ref_exit(&p->users); kvfree(p); return ERR_PTR(-EPERM); } @@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void) plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); - kvfree(defer); + if (defer) { + percpu_ref_exit(&defer->users); + kvfree(defer); + } spin_lock_init(&p->lock); spin_lock_init(&p->cont_lock); + init_completion(&p->comp); return p; } -- cgit From 2799e77529c2a25492a4395db93996e3dacd762d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:36:50 -0700 Subject: swap: fix do_swap_page() race with swapoff When I was investigating the swap code, I found the below possible race window: CPU 1 CPU 2 ----- ----- do_swap_page if (data_race(si->flags & SWP_SYNCHRONOUS_IO) swap_readpage if (data_race(sis->flags & SWP_FS_OPS)) { swapoff .. p->swap_file = NULL; .. struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping;[oops!] Note that for the pages that are swapped in through swap cache, this isn't an issue. Because the page is locked, and the swap entry will be marked with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been unlocked. Fix this race by using get/put_swap_device() to guard against concurrent swapoff. 
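The shape of the fix is the usual take-a-reference-before-dereferencing guard. A minimal userspace model of that guard, with a plain atomic counter standing in for the percpu_ref and invented names (struct swap_dev, get_swap_dev, put_swap_dev):

  #include <stdatomic.h>
  #include <stdio.h>

  struct swap_dev {
          atomic_int users;       /* > 0 while the device is usable */
          const char *backing;    /* stands in for si->swap_file and friends */
  };

  /* tryget: succeed only while the device is still live */
  static struct swap_dev *get_swap_dev(struct swap_dev *si)
  {
          int old = atomic_load(&si->users);

          while (old > 0)
                  if (atomic_compare_exchange_weak(&si->users, &old, old + 1))
                          return si;
          return NULL;            /* swapoff already started */
  }

  static void put_swap_dev(struct swap_dev *si)
  {
          atomic_fetch_sub(&si->users, 1);
  }

  int main(void)
  {
          struct swap_dev dev = { .backing = "swapfile" };
          struct swap_dev *si;

          atomic_store(&dev.users, 1);    /* "swapon": base reference */
          si = get_swap_dev(&dev);
          if (si) {
                  printf("safe to touch %s\n", si->backing);
                  put_swap_dev(si);       /* teardown may proceed afterwards */
          }
          return 0;
  }

In the kernel the counter is the percpu_ref added earlier in the series and the lookup is percpu_ref_tryget_live(), but the calling convention in do_swap_page() is the same: bail out when the get fails and drop the reference on every exit path.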
Link: https://lkml.kernel.org/r/20210426123316.806267-3-linmiaohe@huawei.com Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device") Signed-off-by: Miaohe Lin Reviewed-by: "Huang, Ying" Cc: Alex Shi Cc: David Hildenbrand Cc: Dennis Zhou Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Wei Yang Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 9 +++++++++ mm/memory.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index c9e7fea10b83..46d51d058d05 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -527,6 +527,15 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) return NULL; } +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + return NULL; +} + +static inline void put_swap_device(struct swap_info_struct *si) +{ +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L diff --git a/mm/memory.c b/mm/memory.c index 486f4a2874e7..b15367c285bd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3353,6 +3353,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; + struct swap_info_struct *si = NULL; swp_entry_t entry; pte_t pte; int locked; @@ -3380,14 +3381,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } + /* Prevent swapoff from happening to us. */ + si = get_swap_device(entry); + if (unlikely(!si)) + goto out; delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; if (!page) { - struct swap_info_struct *si = swp_swap_info(entry); - if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ @@ -3556,6 +3559,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: + if (si) + put_swap_device(si); return ret; out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3567,6 +3572,8 @@ out_release: unlock_page(swapcache); put_page(swapcache); } + if (si) + put_swap_device(si); return ret; } -- cgit From 5c046235a826370d528a29c44e0ce05f9685d8b4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:36:53 -0700 Subject: mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info() The non_swap_entry() was used for working with VMA based swap readahead via commit ec560175c0b6 ("mm, swap: VMA based swap readahead"). At that time, the non_swap_entry() checking is necessary because the function is called before checking that in do_swap_page(). Then it's moved to swap_ra_info() since commit eaf649ebc3ac ("mm: swap: clean up swap readahead"). After that, the non_swap_entry() checking is unnecessary, because swap_ra_info() is called after non_swap_entry() has been checked already. The resulting code is confusing as the non_swap_entry() check looks racy now because while we released the pte lock, somebody else might have faulted in this pte. So we should check whether it's swap pte first to guard against such race or swap_type will be unexpected. But the race isn't important because it will not cause problem. We would have enough checking when we really operate the PTE entries later. So we remove the non_swap_entry() check here to avoid confusion. 
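The argument is purely about call ordering, which the toy model below makes explicit: the caller has already filtered out non-swap entries, so a second check inside the readahead helper can never trigger. The function names mirror the kernel ones but the bodies are illustrative stubs, the intermediate readahead layers are omitted, and the entry encoding is faked with a plain integer:

  #include <stdbool.h>
  #include <stdio.h>

  struct swp_entry { unsigned long val; };

  static bool non_swap_entry(struct swp_entry e)
  {
          return e.val == 0;      /* stand-in for the real encoding test */
  }

  static void swap_ra_info(struct swp_entry e)
  {
          /* after the cleanup there is no non_swap_entry() check here */
          printf("sizing readahead window for entry %#lx\n", e.val);
  }

  static void do_swap_page(struct swp_entry e)
  {
          if (non_swap_entry(e))  /* the only check that matters */
                  return;
          swap_ra_info(e);        /* reached only for real swap entries */
  }

  int main(void)
  {
          do_swap_page((struct swp_entry){ .val = 0x42 });
          do_swap_page((struct swp_entry){ .val = 0 }); /* filtered out early */
          return 0;
  }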
Link: https://lkml.kernel.org/r/20210426123316.806267-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: "Huang, Ying" Cc: Alex Shi Cc: David Hildenbrand Cc: Dennis Zhou Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Wei Yang Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 272ea2108c9d..df5405384520 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf, { struct vm_area_struct *vma = vmf->vma; unsigned long ra_val; - swp_entry_t entry; unsigned long faddr, pfn, fpfn; unsigned long start, end; pte_t *pte, *orig_pte; @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf, faddr = vmf->address; orig_pte = pte = pte_offset_map(vmf->pmd, faddr); - entry = pte_to_swp_entry(*pte); - if ((unlikely(non_swap_entry(entry)))) { - pte_unmap(orig_pte); - return; - } fpfn = PFN_DOWN(faddr); ra_val = GET_SWAP_RA_VAL(vma); -- cgit From 2efa33fc7f6ec94a3a538c1a264273c889be2b36 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:36:57 -0700 Subject: mm/shmem: fix shmem_swapin() race with swapoff When I was investigating the swap code, I found the below possible race window: CPU 1 CPU 2 ----- ----- shmem_swapin swap_cluster_readahead if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) { swapoff .. si->swap_file = NULL; .. struct inode *inode = si->swap_file->f_mapping->host;[oops!] Close this race window by using get/put_swap_device() to guard against concurrent swapoff. Link: https://lkml.kernel.org/r/20210426123316.806267-5-linmiaohe@huawei.com Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or not") Signed-off-by: Miaohe Lin Reviewed-by: "Huang, Ying" Cc: Dennis Zhou Cc: Tim Chen Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Joonsoo Kim Cc: Alex Shi Cc: Matthew Wilcox Cc: Minchan Kim Cc: Wei Yang Cc: Yang Shi Cc: David Hildenbrand Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5d46611cba8d..53f21016608e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1696,7 +1696,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; - struct page *page; + struct swap_info_struct *si; + struct page *page = NULL; swp_entry_t swap; int error; @@ -1704,6 +1705,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*pagep); *pagep = NULL; + /* Prevent swapoff from happening to us. */ + si = get_swap_device(swap); + if (!si) { + error = EINVAL; + goto failed; + } /* Look it up and read it in.. 
*/ page = lookup_swap_cache(swap, NULL, 0); if (!page) { @@ -1765,6 +1772,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, swap_free(swap); *pagep = page; + if (si) + put_swap_device(si); return 0; failed: if (!shmem_confirm_swap(mapping, index, swap)) @@ -1775,6 +1784,9 @@ unlock: put_page(page); } + if (si) + put_swap_device(si); + return error; } -- cgit From bb243f7dc62429343404b052e9c51d745e618346 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:37:00 -0700 Subject: mm/swapfile: move get_swap_page_of_type() under CONFIG_HIBERNATION Patch series "Cleanups for swap", v2. This series contains just cleanups to remove some unused variables, delete meaningless forward declarations and so on. More details can be found in the respective changelogs. This patch (of 4): We should move get_swap_page_of_type() under CONFIG_HIBERNATION since the only caller of this function is now suspend routine. [linmiaohe@huawei.com: move scan_swap_map() under CONFIG_HIBERNATION] Link: https://lkml.kernel.org/r/20210521070855.2015094-1-linmiaohe@huawei.com [linmiaohe@huawei.com: fold scan_swap_map() into the only caller get_swap_page_of_type()] Link: https://lkml.kernel.org/r/20210527120328.3935132-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210520134022.1370406-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210520134022.1370406-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 83 ++++++++++++++++++++++------------------------------------- 1 file changed, 31 insertions(+), 52 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a9a04a5360d9..115f0b0c0c10 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -453,10 +453,10 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, unsigned int idx) { /* - * If scan_swap_map() can't find a free cluster, it will check + * If scan_swap_map_slots() can't find a free cluster, it will check * si->swap_map directly. To make sure the discarding cluster isn't - * taken by scan_swap_map(), mark the swap entries bad (occupied). It - * will be cleared after discard + * taken by scan_swap_map_slots(), mark the swap entries bad (occupied). + * It will be cleared after discard */ memset(si->swap_map + idx * SWAPFILE_CLUSTER, SWAP_MAP_BAD, SWAPFILE_CLUSTER); @@ -589,7 +589,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p, } /* - * It's possible scan_swap_map() uses a free cluster in the middle of free + * It's possible scan_swap_map_slots() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. 
*/ static bool @@ -1037,21 +1037,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) swap_range_free(si, offset, SWAPFILE_CLUSTER); } -static unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) -{ - swp_entry_t entry; - int n_ret; - - n_ret = scan_swap_map_slots(si, usage, 1, &entry); - - if (n_ret) - return swp_offset(entry); - else - return 0; - -} - int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { unsigned long size = swap_entry_size(entry_size); @@ -1114,14 +1099,14 @@ start_over: nextsi: /* * if we got here, it's likely that si was almost full before, - * and since scan_swap_map() can drop the si->lock, multiple - * callers probably all tried to get a page from the same si - * and it filled up before we could get one; or, the si filled - * up between us dropping swap_avail_lock and taking si->lock. - * Since we dropped the swap_avail_lock, the swap_avail_head - * list may have been modified; so if next is still in the - * swap_avail_head list then try it, otherwise start over - * if we have not gotten any slots. + * and since scan_swap_map_slots() can drop the si->lock, + * multiple callers probably all tried to get a page from the + * same si and it filled up before we could get one; or, the si + * filled up between us dropping swap_avail_lock and taking + * si->lock. Since we dropped the swap_avail_lock, the + * swap_avail_head list may have been modified; so if next is + * still in the swap_avail_head list then try it, otherwise + * start over if we have not gotten any slots. */ if (plist_node_empty(&next->avail_lists[node])) goto start_over; @@ -1137,30 +1122,6 @@ noswap: return n_ret; } -/* The only caller of this function is now suspend routine */ -swp_entry_t get_swap_page_of_type(int type) -{ - struct swap_info_struct *si = swap_type_to_swap_info(type); - pgoff_t offset; - - if (!si) - goto fail; - - spin_lock(&si->lock); - if (si->flags & SWP_WRITEOK) { - /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, 1); - if (offset) { - atomic_long_dec(&nr_swap_pages); - spin_unlock(&si->lock); - return swp_entry(type, offset); - } - } - spin_unlock(&si->lock); -fail: - return (swp_entry_t) {0}; -} - static struct swap_info_struct *__swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -1812,6 +1773,24 @@ int free_swap_and_cache(swp_entry_t entry) } #ifdef CONFIG_HIBERNATION + +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si = swap_type_to_swap_info(type); + swp_entry_t entry = {0}; + + if (!si) + goto fail; + + /* This is called for allocating swap entry, not cache */ + spin_lock(&si->lock); + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + atomic_long_dec(&nr_swap_pages); + spin_unlock(&si->lock); +fail: + return entry; +} + /* * Find the swap type that corresponds to given device (if any). * @@ -2649,7 +2628,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_lock(&p->lock); drain_mmlist(); - /* wait for anyone still in scan_swap_map */ + /* wait for anyone still in scan_swap_map_slots */ p->highest_bit = 0; /* cuts scans short */ while (p->flags >= SWP_SCANNING) { spin_unlock(&p->lock); -- cgit From eb7709c5f3e55e230b9c8d8e79aa261b316066c2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:37:03 -0700 Subject: mm/swap: remove unused local variable nr_shadows Since commit 55c653b71e8c ("mm: stop accounting shadow entries"), nr_shadows is not used anymore. 
Link: https://lkml.kernel.org/r/20210520134022.1370406-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index df5405384520..1a2ba4056f37 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -114,8 +114,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, SetPageSwapCache(page); do { - unsigned long nr_shadows = 0; - xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) @@ -124,7 +122,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); old = xas_load(&xas); if (xa_is_value(old)) { - nr_shadows++; if (shadowp) *shadowp = old; } @@ -260,7 +257,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, void *old; for (;;) { - unsigned long nr_shadows = 0; swp_entry_t entry = swp_entry(type, curr); struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); @@ -270,7 +266,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, if (!xa_is_value(old)) continue; xas_store(&xas, NULL); - nr_shadows++; } xa_unlock_irq(&address_space->i_pages); -- cgit From 1cfcc8306a767bda9a8fe6fddb3e80ca9ab7656b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 28 Jun 2021 19:37:06 -0700 Subject: mm/swap_slots.c: delete meaningless forward declarations deactivate_swap_slots_cache() and reactivate_swap_slots_cache() are only called below their implementations. So these forward declarations are meaningless and should be removed. Link: https://lkml.kernel.org/r/20210520134022.1370406-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 6248d1030a9b..a66f3e0ec973 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -43,8 +43,6 @@ static DEFINE_MUTEX(swap_slots_cache_mutex); static DEFINE_MUTEX(swap_slots_cache_enable_mutex); static void __drain_swap_slots_cache(unsigned int type); -static void deactivate_swap_slots_cache(void); -static void reactivate_swap_slots_cache(void); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) #define SLOTS_CACHE 0x1 -- cgit From a4b451143fa275a31f17a93adac3b8dbb3d20ca2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 28 Jun 2021 19:37:09 -0700 Subject: mm, swap: remove unnecessary smp_rmb() in swap_type_to_swap_info() Before commit c10d38cc8d3e ("mm, swap: bounds check swap_info array accesses to avoid NULL derefs"), the typical code to reference the swap_info[] is as follows, type = swp_type(swp_entry); if (type >= nr_swapfiles) /* handle invalid swp_entry */; p = swap_info[type]; /* access fields of *p. OOPS! p may be NULL! */ Because the ordering isn't guaranteed, it's possible that swap_info[type] is read before "nr_swapfiles". And that may result in NULL pointer dereference. 
So after commit c10d38cc8d3e, the code becomes, struct swap_info_struct *swap_type_to_swap_info(int type) { if (type >= READ_ONCE(nr_swapfiles)) return NULL; smp_rmb(); return READ_ONCE(swap_info[type]); } /* users */ type = swp_type(swp_entry); p = swap_type_to_swap_info(type); if (!p) /* handle invalid swp_entry */; /* dereference p */ Where the value of swap_info[type] (that is, "p") is checked to be non-zero before being dereferenced. So, the NULL deferencing becomes impossible even if "nr_swapfiles" is read after swap_info[type]. Therefore, the "smp_rmb()" becomes unnecessary. And, we don't even need to read "nr_swapfiles" here. Because the non-zero checking for "p" is sufficient. We just need to make sure we will not access out of the boundary of the array. With the change, nr_swapfiles will only be accessed with swap_lock held, except in swapcache_free_entries(). Where the absolute correctness of the value isn't needed, as described in the comments. We still need to guarantee swap_info[type] is read before being dereferenced. That can be satisfied via the data dependency ordering enforced by READ_ONCE(swap_info[type]). This needs to be paired with proper write barriers. So smp_store_release() is used in alloc_swap_info() to guarantee the fields of *swap_info[type] is initialized before swap_info[type] itself being written. Note that the fields of *swap_info[type] is initialized to be 0 via kvzalloc() firstly. The assignment and deferencing of swap_info[type] is like rcu_assign_pointer() and rcu_dereference(). Link: https://lkml.kernel.org/r/20210520073301.1676294-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Daniel Jordan Cc: Dan Carpenter Cc: Andrea Parri Cc: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Dave Hansen Cc: Omar Sandoval Cc: Paul McKenney Cc: Tejun Heo Cc: Will Deacon Cc: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 115f0b0c0c10..e898c879a434 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -100,11 +100,10 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0); static struct swap_info_struct *swap_type_to_swap_info(int type) { - if (type >= READ_ONCE(nr_swapfiles)) + if (type >= MAX_SWAPFILES) return NULL; - smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ - return READ_ONCE(swap_info[type]); + return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } static inline unsigned char swap_count(unsigned char ent) @@ -2863,14 +2862,12 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= nr_swapfiles) { p->type = type; - WRITE_ONCE(swap_info[type], p); /* - * Write swap_info[type] before nr_swapfiles, in case a - * racing procfs swap_start() or swap_next() is reading them. - * (We never shrink nr_swapfiles, we never free this entry.) + * Publish the swap_info_struct after initializing it. + * Note that kvzalloc() above zeroes all its fields. 
*/ - smp_wmb(); - WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); + smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */ + nr_swapfiles++; } else { defer = p; p = swap_info[type]; -- cgit From f4c4a3f48480730214c4f02ffa480f6bf5b0718f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 28 Jun 2021 19:37:12 -0700 Subject: mm: free idle swap cache page after COW With commit 09854ba94c6a ("mm: do_wp_page() simplification"), after COW, the idle swap cache page (neither the page nor the corresponding swap entry is mapped by any process) will be left in the LRU list, even if it's in the active list or the head of the inactive list. So, the page reclaimer may take quite some overhead to reclaim these actually unused pages. To help the page reclaiming, in this patch, after COW, the idle swap cache page will be tried to be freed. To avoid to introduce much overhead to the hot COW code path, a) there's almost zero overhead for non-swap case via checking PageSwapCache() firstly. b) the page lock is acquired via trylock only. To test the patch, we used pmbench memory accessing benchmark with working-set larger than available memory on a 2-socket Intel server with a NVMe SSD as swap device. Test results shows that the pmbench score increases up to 23.8% with the decreased size of swap cache and swapin throughput. Link: https://lkml.kernel.org/r/20210601053143.1380078-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Johannes Weiner [use free_swap_cache()] Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Peter Xu Cc: Mel Gorman Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Dave Hansen Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 5 +++++ mm/memory.c | 2 ++ mm/swap_state.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 46d51d058d05..49b1dd2c100b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -446,6 +446,7 @@ extern void __delete_from_swap_cache(struct page *page, extern void delete_from_swap_cache(struct page *); extern void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +extern void free_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); extern struct page *lookup_swap_cache(swp_entry_t entry, @@ -551,6 +552,10 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); +static inline void free_swap_cache(struct page *page) +{ +} + static inline void show_swap_cache_info(void) { } diff --git a/mm/memory.c b/mm/memory.c index b15367c285bd..a4d82a6de000 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3023,6 +3023,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) munlock_vma_page(old_page); unlock_page(old_page); } + if (page_copied) + free_swap_cache(old_page); put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; diff --git a/mm/swap_state.c b/mm/swap_state.c index 1a2ba4056f37..4f8a912ff692 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -286,7 +286,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, * try_to_free_swap() _with_ the lock. 
* - Marcelo */ -static inline void free_swap_cache(struct page *page) +void free_swap_cache(struct page *page) { if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { try_to_free_swap(page); -- cgit From eea4a5011ae520c98d0a14474ecde44f29659861 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 28 Jun 2021 19:37:16 -0700 Subject: swap: check mapping_empty() for swap cache before being freed To check whether all pages and shadow entries in swap cache has been removed before swap cache is freed. Link: https://lkml.kernel.org/r/20210608005121.511140-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Miaohe Lin Cc: Matthew Wilcox Cc: Minchan Kim Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Mel Gorman Cc: Michal Hocko Cc: Dan Williams Cc: Christoph Hellwig Cc: Ilya Dryomov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4f8a912ff692..c56aa9ac050d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -693,7 +693,12 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) void exit_swap_address_space(unsigned int type) { - kvfree(swapper_spaces[type]); + int i; + struct address_space *spaces = swapper_spaces[type]; + + for (i = 0; i < nr_swapper_spaces[type]; i++) + VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); + kvfree(spaces); nr_swapper_spaces[type] = 0; swapper_spaces[type] = NULL; } -- cgit From fdbcb2a6d6778e0b91938529694e5f40b4a66130 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:19 -0700 Subject: mm/memcg: move mod_objcg_state() to memcontrol.c Patch series "mm/memcg: Reduce kmemcache memory accounting overhead", v6. With the recent introduction of the new slab memory controller, we eliminate the need for having separate kmemcaches for each memory cgroup and reduce overall kernel memory usage. However, we also add additional memory accounting overhead to each call of kmem_cache_alloc() and kmem_cache_free(). For workloads that require a lot of kmemcache allocations and de-allocations, they may experience performance regression as illustrated in [1] and [2]. A simple kernel module that performs repeated loop of 100,000,000 kmem_cache_alloc() and kmem_cache_free() of either a small 32-byte object or a big 4k object at module init time with a batch size of 4 (4 kmalloc's followed by 4 kfree's) is used for benchmarking. The benchmarking tool was run on a kernel based on linux-next-20210419. The test was run on a CascadeLake server with turbo-boosting disable to reduce run-to-run variation. The small object test exercises mainly the object stock charging and vmstat update code paths. The large object test also exercises the refill_obj_stock() and __memcg_kmem_charge()/__memcg_kmem_uncharge() code paths. With memory accounting disabled, the run time was 3.130s with both small object big object tests. With memory accounting enabled, both cgroup v1 and v2 showed similar results in the small object test. The performance results of the large object test, however, differed between cgroup v1 and v2. 
The execution times with the application of various patches in the patchset were:

  Applied patches   Run time   Accounting overhead   %age 1   %age 2
  ---------------   --------   -------------------   ------   ------
  Small 32-byte object:
   None              11.634s        8.504s           100.0%   271.7%
   1-2                9.425s        6.295s            74.0%   201.1%
   1-3                9.708s        6.578s            77.4%   210.2%
   1-4                8.062s        4.932s            58.0%   157.6%
  Large 4k object (v2):
   None              22.107s       18.977s           100.0%   606.3%
   1-2               20.960s       17.830s            94.0%   569.6%
   1-3               14.238s       11.108s            58.5%   354.9%
   1-4               11.329s        8.199s            43.2%   261.9%
  Large 4k object (v1):
   None              36.807s       33.677s           100.0%  1075.9%
   1-2               36.648s       33.518s            99.5%  1070.9%
   1-3               22.345s       19.215s            57.1%   613.9%
   1-4               18.662s       15.532s            46.1%   496.2%

  N.B. %age 1 = overhead/unpatched overhead
       %age 2 = overhead/accounting disabled time

Patch 2 (vmstat data stock caching) helps in both the small object test and the large v2 object test. It doesn't help much in the v1 big object test. Patch 3 (refill_obj_stock improvement) does help the small object test but offers a significant performance improvement for the large object test (both v1 and v2). Patch 4 (eliminating irq disable/enable) helps in all test cases.

To test for the extreme case, a multi-threaded kmalloc/kfree microbenchmark was run on the 2-socket 48-core 96-thread system with 96 testing threads in the same memcg doing kmalloc+kfree of a 4k object with accounting enabled for 10s. The total number of kmalloc+kfree operations done, in kilo operations per second (kops/s), was as follows:

  Applied patches   v1 kops/s   v1 change   v2 kops/s   v2 change
  ---------------   ---------   ---------   ---------   ---------
  None                  3,520       1.00X       6,242       1.00X
  1-2                   4,304       1.22X       8,478       1.36X
  1-3                   4,731       1.34X     418,142      66.99X
  1-4                   4,587       1.30X     438,838      70.30X

With memory accounting disabled, the kmalloc/kfree rate was 1,481,291 kops/s. This test shows how significant the memory accounting overhead can be in some extreme situations. For this multithreaded test, the improvement from patch 2 mainly comes from the conditional atomic xchg of objcg->nr_charged_bytes in mod_objcg_state(). By using an unconditional xchg, the operation rates were similar to the unpatched kernel. Patch 3 eliminates the single highly contended cacheline of objcg->nr_charged_bytes for cgroup v2, leading to a huge performance improvement. Cgroup v1, however, still has another highly contended cacheline in the shared page counter &memcg->kmem. So the improvement is only modest. Patch 4 helps in cgroup v2, but performs worse in cgroup v1 as eliminating the irq_disable/irq_enable overhead seems to aggravate the cacheline contention.

[1] https://lore.kernel.org/linux-mm/20210408193948.vfktg3azh2wrt56t@gabell/T/#u
[2] https://lore.kernel.org/lkml/20210114025151.GA22932@xsang-OptiPlex-9020/

This patch (of 4): mod_objcg_state() is moved from mm/slab.h to mm/memcontrol.c so that further optimization can be done to it in later patches without exposing unnecessary details to other mm components.
Link: https://lkml.kernel.org/r/20210506150007.16288-1-longman@redhat.com Link: https://lkml.kernel.org/r/20210506150007.16288-2-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Alex Shi Cc: Chris Down Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Masayoshi Mizuma Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Pekka Enberg Cc: Tejun Heo Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xing Zhengjun Cc: Yafang Shao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++++++++ mm/slab.h | 16 ++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64ada9e650a5..7cd7187a017c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -782,6 +782,19 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + mod_memcg_lruvec_state(lruvec, idx, nr); + rcu_read_unlock(); +} + /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup diff --git a/mm/slab.h b/mm/slab.h index 7f9b4bd9fc65..f2c32f24da95 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -240,6 +240,8 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page); +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr); static inline void memcg_free_page_obj_cgroups(struct page *page) { @@ -284,20 +286,6 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, return true; } -static inline void mod_objcg_state(struct obj_cgroup *objcg, - struct pglist_data *pgdat, - enum node_stat_item idx, int nr) -{ - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - rcu_read_lock(); - memcg = obj_cgroup_memcg(objcg); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); - rcu_read_unlock(); -} - static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, size_t size, -- cgit From 68ac5b3c8db2fda00af594eca4100aceaf927c0e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:23 -0700 Subject: mm/memcg: cache vmstat data in percpu memcg_stock_pcp Before the new slab memory controller with per object byte charging, charging and vmstat data update happen only when new slab pages are allocated or freed. Now they are done with every kmem_cache_alloc() and kmem_cache_free(). This causes additional overhead for workloads that generate a lot of alloc and free calls. The memcg_stock_pcp is used to cache byte charge for a specific obj_cgroup to reduce that overhead. To further reducing it, this patch makes the vmstat data cached in the memcg_stock_pcp structure as well until it accumulates a page size worth of update or when other cached data change. Caching the vmstat data in the per-cpu stock eliminates two writes to non-hot cachelines for memcg specific as well as memcg-lruvecs specific vmstat data by a write to a hot local stock cacheline. 
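The batching idea can be sketched in userspace: keep a local running total per execution context and only fold it into the shared, contended counter once roughly a page worth of bytes has accumulated. The threshold, the names and the use of _Thread_local in place of a per-CPU variable are all illustrative:

  #include <stdatomic.h>
  #include <stdio.h>

  #define PAGE_SIZE 4096

  static atomic_long shared_vmstat;               /* the contended cache line */
  static _Thread_local long cached_bytes;         /* per-CPU/-thread "stock" */

  static void mod_state_cached(long nr)
  {
          cached_bytes += nr;
          if (cached_bytes > PAGE_SIZE || cached_bytes < -PAGE_SIZE) {
                  atomic_fetch_add(&shared_vmstat, cached_bytes);
                  cached_bytes = 0;
          }
  }

  int main(void)
  {
          for (int i = 0; i < 1000; i++)
                  mod_state_cached(32);           /* 1000 x 32-byte objects */
          printf("flushed: %ld, still cached: %ld\n",
                 (long)atomic_load(&shared_vmstat), cached_bytes);
          return 0;
  }

The real code additionally flushes whenever the cached objcg or pgdat changes, so a stale delta is never attributed to the wrong cgroup or node.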
On a 2-socket Cascade Lake server with instrumentation enabled and this patch applied, it was found that about 20% (634400 out of 3243830) of the time when mod_objcg_state() is called leads to an actual call to __mod_objcg_state() after initial boot. When doing parallel kernel build, the figure was about 17% (24329265 out of 142512465). So caching the vmstat data reduces the number of calls to __mod_objcg_state() by more than 80%. Link: https://lkml.kernel.org/r/20210506150007.16288-3-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Shakeel Butt Cc: Alex Shi Cc: Chris Down Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Masayoshi Mizuma Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Pekka Enberg Cc: Roman Gushchin Cc: Tejun Heo Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xing Zhengjun Cc: Yafang Shao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7cd7187a017c..b4624580d18a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, + struct pglist_data *pgdat, + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - mod_memcg_lruvec_state(lruvec, idx, nr); + __mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp { #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; + struct pglist_data *cached_pgdat; unsigned int nr_bytes; + int nr_slab_reclaimable_b; + int nr_slab_unreclaimable_b; #endif struct work_struct work; @@ -3008,6 +3012,67 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr) +{ + struct memcg_stock_pcp *stock; + unsigned long flags; + int *bytes; + + local_irq_save(flags); + stock = this_cpu_ptr(&memcg_stock); + + /* + * Save vmstat data in stock and skip vmstat array update unless + * accumulating over a page of vmstat data or when pgdat or idx + * changes. + */ + if (stock->cached_objcg != objcg) { + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + stock->cached_objcg = objcg; + stock->cached_pgdat = pgdat; + } else if (stock->cached_pgdat != pgdat) { + /* Flush the existing cached vmstat data */ + if (stock->nr_slab_reclaimable_b) { + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, + stock->nr_slab_reclaimable_b); + stock->nr_slab_reclaimable_b = 0; + } + if (stock->nr_slab_unreclaimable_b) { + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, + stock->nr_slab_unreclaimable_b); + stock->nr_slab_unreclaimable_b = 0; + } + stock->cached_pgdat = pgdat; + } + + bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? 
&stock->nr_slab_reclaimable_b + : &stock->nr_slab_unreclaimable_b; + /* + * Even for large object >= PAGE_SIZE, the vmstat data will still be + * cached locally at least once before pushing it out. + */ + if (!*bytes) { + *bytes = nr; + nr = 0; + } else { + *bytes += nr; + if (abs(*bytes) > PAGE_SIZE) { + nr = *bytes; + *bytes = 0; + } else { + nr = 0; + } + } + if (nr) + mod_objcg_mlstate(objcg, pgdat, idx, nr); + + local_irq_restore(flags); +} + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { struct memcg_stock_pcp *stock; @@ -3055,6 +3120,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) stock->nr_bytes = 0; } + /* + * Flush the vmstat data in current stock + */ + if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { + if (stock->nr_slab_reclaimable_b) { + mod_objcg_mlstate(old, stock->cached_pgdat, + NR_SLAB_RECLAIMABLE_B, + stock->nr_slab_reclaimable_b); + stock->nr_slab_reclaimable_b = 0; + } + if (stock->nr_slab_unreclaimable_b) { + mod_objcg_mlstate(old, stock->cached_pgdat, + NR_SLAB_UNRECLAIMABLE_B, + stock->nr_slab_unreclaimable_b); + stock->nr_slab_unreclaimable_b = 0; + } + stock->cached_pgdat = NULL; + } + obj_cgroup_put(old); stock->cached_objcg = NULL; } -- cgit From 5387c90490f7f42df3209154ca955a453ee01b41 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:27 -0700 Subject: mm/memcg: improve refill_obj_stock() performance There are two issues with the current refill_obj_stock() code. First of all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to atomically flush out remaining bytes to obj_cgroup, clear cached_objcg and do a obj_cgroup_put(). It is likely that the same obj_cgroup will be used again which leads to another call to drain_obj_stock() and obj_cgroup_get() as well as atomically retrieve the available byte from obj_cgroup. That is costly. Instead, we should just uncharge the excess pages, reduce the stock bytes and be done with it. The drain_obj_stock() function should only be called when obj_cgroup changes. Secondly, when charging an object of size not less than a page in obj_cgroup_charge(), it is possible that the remaining bytes to be refilled to the stock will overflow a page and cause refill_obj_stock() to uncharge 1 page. To avoid the additional uncharge in this case, a new allow_uncharge flag is added to refill_obj_stock() which will be set to false when called from obj_cgroup_charge() so that an uncharge_pages() call won't be issued right after a charge_pages() call unless the objcg changes. A multithreaded kmalloc+kfree microbenchmark on a 2-socket 48-core 96-thread x86-64 system with 96 testing threads were run. Before this patch, the total number of kilo kmalloc+kfree operations done for a 4k large object by all the testing threads per second were 4,304 kops/s (cgroup v1) and 8,478 kops/s (cgroup v2). After applying this patch, the number were 4,731 (cgroup v1) and 418,142 (cgroup v2) respectively. This represents a performance improvement of 1.10X (cgroup v1) and 49.3X (cgroup v2). 
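The first of those changes can be pictured as a refill that keeps the sub-page remainder cached and hands back only whole excess pages; PAGE_SHIFT, the stock variable and uncharge_pages() below are stand-ins rather than the kernel interfaces, so this is only an illustrative sketch:

  #include <stdio.h>

  #define PAGE_SHIFT 12UL
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)

  static unsigned long stock_bytes;       /* per-CPU cached pre-charged bytes */

  static void uncharge_pages(unsigned long nr_pages)
  {
          printf("returning %lu page(s) to the page counter\n", nr_pages);
  }

  static void refill_obj_stock(unsigned long nr_bytes, int allow_uncharge)
  {
          unsigned long nr_pages = 0;

          stock_bytes += nr_bytes;
          if (allow_uncharge && stock_bytes > PAGE_SIZE) {
                  nr_pages = stock_bytes >> PAGE_SHIFT;   /* whole pages only */
                  stock_bytes &= PAGE_SIZE - 1;           /* keep the remainder */
          }
          if (nr_pages)
                  uncharge_pages(nr_pages);
  }

  int main(void)
  {
          refill_obj_stock(3000, 1);      /* stays cached, nothing uncharged */
          refill_obj_stock(3000, 1);      /* 6000 bytes: one page goes back */
          printf("bytes still cached: %lu\n", stock_bytes);
          return 0;
  }

Passing allow_uncharge as false, as the charge path does, simply skips the give-back so that a charge is never immediately followed by an uncharge of the same page.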
Link: https://lkml.kernel.org/r/20210506150007.16288-4-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Shakeel Butt Cc: Alex Shi Cc: Chris Down Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Masayoshi Mizuma Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Pekka Enberg Cc: Roman Gushchin Cc: Tejun Heo Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Wei Yang Cc: Xing Zhengjun Cc: Yafang Shao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b4624580d18a..17d38c7f630f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3157,10 +3157,12 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, return false; } -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + bool allow_uncharge) { struct memcg_stock_pcp *stock; unsigned long flags; + unsigned int nr_pages = 0; local_irq_save(flags); @@ -3169,14 +3171,21 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) drain_obj_stock(stock); obj_cgroup_get(objcg); stock->cached_objcg = objcg; - stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; - if (stock->nr_bytes > PAGE_SIZE) - drain_obj_stock(stock); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { + nr_pages = stock->nr_bytes >> PAGE_SHIFT; + stock->nr_bytes &= (PAGE_SIZE - 1); + } local_irq_restore(flags); + + if (nr_pages) + obj_cgroup_uncharge_pages(objcg, nr_pages); } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) @@ -3188,14 +3197,27 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) return 0; /* - * In theory, memcg->nr_charged_bytes can have enough + * In theory, objcg->nr_charged_bytes can have enough * pre-charged bytes to satisfy the allocation. However, - * flushing memcg->nr_charged_bytes requires two atomic - * operations, and memcg->nr_charged_bytes can't be big, - * so it's better to ignore it and try grab some new pages. - * memcg->nr_charged_bytes will be flushed in - * refill_obj_stock(), called from this function or - * independently later. + * flushing objcg->nr_charged_bytes requires two atomic + * operations, and objcg->nr_charged_bytes can't be big. + * The shared objcg->nr_charged_bytes can also become a + * performance bottleneck if all tasks of the same memcg are + * trying to update it. So it's better to ignore it and try + * grab some new pages. The stock's nr_bytes will be flushed to + * objcg->nr_charged_bytes later on when objcg changes. + * + * The stock's nr_bytes may contain enough pre-charged bytes + * to allow one less page from being charged, but we can't rely + * on the pre-charged bytes not being changed outside of + * consume_obj_stock() or refill_obj_stock(). So ignore those + * pre-charged bytes as well when charging pages. To avoid a + * page uncharge right after a page charge, we set the + * allow_uncharge flag to false when calling refill_obj_stock() + * to temporarily allow the pre-charged bytes to exceed the page + * size limit. 
The maximum reachable value of the pre-charged + * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data + * race. */ nr_pages = size >> PAGE_SHIFT; nr_bytes = size & (PAGE_SIZE - 1); @@ -3205,14 +3227,14 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes); + refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); return ret; } void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size); + refill_obj_stock(objcg, size, true); } #endif /* CONFIG_MEMCG_KMEM */ -- cgit From 559271146efc0bf125e6390191f683eab884e4a1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:30 -0700 Subject: mm/memcg: optimize user context object stock access Most kmem_cache_alloc() calls are from user context. With instrumentation enabled, the measured amount of kmem_cache_alloc() calls from non-task context was about 0.01% of the total. The irq disable/enable sequence used in this case to access content from object stock is slow. To optimize for user context access, there are now two sets of object stocks (in the new obj_stock structure) for task context and interrupt context access respectively. The task context object stock can be accessed after disabling preemption which is cheap in non-preempt kernel. The interrupt context object stock can only be accessed after disabling interrupt. User context code can access interrupt object stock, but not vice versa. The downside of this change is that there are more data stored in local object stocks and not reflected in the charge counter and the vmstat arrays. However, this is a small price to pay for better performance. [longman@redhat.com: fix potential uninitialized variable warning] Link: https://lkml.kernel.org/r/20210526193602.8742-1-longman@redhat.com [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20210506150007.16288-5-longman@redhat.com Signed-off-by: Waiman Long Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Muchun Song Cc: Alex Shi Cc: Chris Down Cc: Yafang Shao Cc: Wei Yang Cc: Masayoshi Mizuma Cc: Xing Zhengjun Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 100 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17d38c7f630f..97f76ce04eae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -782,6 +782,10 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } +/* + * mod_objcg_mlstate() may be called with irq enabled, so + * mod_memcg_lruvec_state() should be used. 
+ */ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) @@ -792,7 +796,7 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_memcg_lruvec_state(lruvec, idx, nr); + mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } @@ -2054,17 +2058,23 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -struct memcg_stock_pcp { - struct mem_cgroup *cached; /* this never be root cgroup */ - unsigned int nr_pages; - +struct obj_stock { #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *cached_objcg; struct pglist_data *cached_pgdat; unsigned int nr_bytes; int nr_slab_reclaimable_b; int nr_slab_unreclaimable_b; +#else + int dummy[0]; #endif +}; + +struct memcg_stock_pcp { + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + struct obj_stock task_obj; + struct obj_stock irq_obj; struct work_struct work; unsigned long flags; @@ -2074,12 +2084,12 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static DEFINE_MUTEX(percpu_charge_mutex); #ifdef CONFIG_MEMCG_KMEM -static void drain_obj_stock(struct memcg_stock_pcp *stock); +static void drain_obj_stock(struct obj_stock *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); #else -static inline void drain_obj_stock(struct memcg_stock_pcp *stock) +static inline void drain_obj_stock(struct obj_stock *stock) { } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -2089,6 +2099,41 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } #endif +/* + * Most kmem_cache_alloc() calls are from user context. The irq disable/enable + * sequence used in this case to access content from object stock is slow. + * To optimize for user context access, there are now two object stocks for + * task context and interrupt context access respectively. + * + * The task context object stock can be accessed by disabling preemption only + * which is cheap in non-preempt kernel. The interrupt context object stock + * can only be accessed after disabling interrupt. User context code can + * access interrupt object stock, but not vice versa. + */ +static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +{ + struct memcg_stock_pcp *stock; + + if (likely(in_task())) { + *pflags = 0UL; + preempt_disable(); + stock = this_cpu_ptr(&memcg_stock); + return &stock->task_obj; + } + + local_irq_save(*pflags); + stock = this_cpu_ptr(&memcg_stock); + return &stock->irq_obj; +} + +static inline void put_obj_stock(unsigned long flags) +{ + if (likely(in_task())) + preempt_enable(); + else + local_irq_restore(flags); +} + /** * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. 
@@ -2155,7 +2200,9 @@ static void drain_local_stock(struct work_struct *dummy) local_irq_save(flags); stock = this_cpu_ptr(&memcg_stock); - drain_obj_stock(stock); + drain_obj_stock(&stock->irq_obj); + if (in_task()) + drain_obj_stock(&stock->task_obj); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); @@ -3015,13 +3062,10 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { - struct memcg_stock_pcp *stock; unsigned long flags; + struct obj_stock *stock = get_obj_stock(&flags); int *bytes; - local_irq_save(flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless * accumulating over a page of vmstat data or when pgdat or idx @@ -3070,29 +3114,26 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) mod_objcg_mlstate(objcg, pgdat, idx, nr); - local_irq_restore(flags); + put_obj_stock(flags); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) { - struct memcg_stock_pcp *stock; unsigned long flags; + struct obj_stock *stock = get_obj_stock(&flags); bool ret = false; - local_irq_save(flags); - - stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; } - local_irq_restore(flags); + put_obj_stock(flags); return ret; } -static void drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct obj_stock *stock) { struct obj_cgroup *old = stock->cached_objcg; @@ -3148,8 +3189,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { struct mem_cgroup *memcg; - if (stock->cached_objcg) { - memcg = obj_cgroup_memcg(stock->cached_objcg); + if (in_task() && stock->task_obj.cached_objcg) { + memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } + if (stock->irq_obj.cached_objcg) { + memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) return true; } @@ -3160,13 +3206,10 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { - struct memcg_stock_pcp *stock; unsigned long flags; + struct obj_stock *stock = get_obj_stock(&flags); unsigned int nr_pages = 0; - local_irq_save(flags); - - stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ drain_obj_stock(stock); obj_cgroup_get(objcg); @@ -3182,7 +3225,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_irq_restore(flags); + put_obj_stock(flags); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); @@ -6790,6 +6833,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; + bool use_objcg = PageMemcgKmem(page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -6798,7 +6842,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) * page memcg or objcg at this point, we have fully * exclusive access to the page. 
*/ - if (PageMemcgKmem(page)) { + if (use_objcg) { objcg = __page_objcg(page); /* * This get matches the put at the end of the function and @@ -6826,7 +6870,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) nr_pages = compound_nr(page); - if (PageMemcgKmem(page)) { + if (use_objcg) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; -- cgit From 41eb5df1cbc9b302fc263ad7c9f38cfc38b4df61 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:34 -0700 Subject: mm: memcg/slab: properly set up gfp flags for objcg pointer array Patch series "mm: memcg/slab: Fix objcg pointer array handling problem", v4. Since the merging of the new slab memory controller in v5.9, the page structure stores a pointer to objcg pointer array for slab pages. When the slab has no used objects, it can be freed in free_slab() which will call kfree() to free the objcg pointer array in memcg_alloc_page_obj_cgroups(). If it happens that the objcg pointer array is the last used object in its slab, that slab may then be freed which may caused kfree() to be called again. With the right workload, the slab cache may be set up in a way that allows the recursive kfree() calling loop to nest deep enough to cause a kernel stack overflow and panic the system. In fact, we have a reproducer that can cause kernel stack overflow on a s390 system involving kmalloc-rcl-256 and kmalloc-rcl-128 slabs with the following kfree() loop recursively called 74 times: [ 285.520739] [<000000000ec432fc>] kfree+0x4bc/0x560 [ 285.520740] [<000000000ec43466>] __free_slab+0xc6/0x228 [ 285.520741] [<000000000ec41fc2>] __slab_free+0x3c2/0x3e0 [ 285.520742] [<000000000ec432fc>] kfree+0x4bc/0x560 : While investigating this issue, I also found an issue on the allocation side. If the objcg pointer array happen to come from the same slab or a circular dependency linkage is formed with multiple slabs, those affected slabs can never be freed again. This patch series addresses these two issues by introducing a new set of kmalloc-cg- caches split from kmalloc- caches. The new set will only contain non-reclaimable and non-dma objects that are accounted in memory cgroups whereas the old set are now for unaccounted objects only. By making this split, all the objcg pointer arrays will come from the kmalloc- caches, but those caches will never hold any objcg pointer array. As a result, deeply nested kfree() call and the unfreeable slab problems are now gone. This patch (of 4): Since the merging of the new slab memory controller in v5.9, the page structure may store a pointer to obj_cgroup pointer array for slab pages. Currently, only the __GFP_ACCOUNT bit is masked off. However, the array is not readily reclaimable and doesn't need to come from the DMA buffer. So those GFP bits should be masked off as well. Do the flag bit clearing at memcg_alloc_page_obj_cgroups() to make sure that it is consistently applied no matter where it is called. 
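As a rough sketch of the flag handling described above (condensed from the memcg_alloc_page_obj_cgroups() hunk that follows; the helper name alloc_objcg_array() below is made up purely for illustration, the real change is applied directly inside that function):

#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)

/*
 * The objcg pointer array itself must not be accounted, is not readily
 * reclaimable and does not need to come from a DMA zone, so those GFP
 * bits are stripped before the array is allocated.
 */
static void *alloc_objcg_array(unsigned int objects, gfp_t gfp, int nid)
{
	gfp &= ~OBJCGS_CLEAR_MASK;
	return kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, nid);
}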
Link: https://lkml.kernel.org/r/20210505200610.13943-1-longman@redhat.com Link: https://lkml.kernel.org/r/20210505200610.13943-2-longman@redhat.com Fixes: 286e04b8ed7a ("mm: memcg/slab: allocate obj_cgroups for non-root slab pages") Signed-off-by: Waiman Long Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++++ mm/slab.h | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 97f76ce04eae..2508bd97349c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2803,6 +2803,13 @@ retry: } #ifdef CONFIG_MEMCG_KMEM +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. + */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) + int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp, bool new_page) { @@ -2810,6 +2817,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, unsigned long memcg_data; void *vec; + gfp &= ~OBJCGS_CLEAR_MASK; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, page_to_nid(page)); if (!vec) diff --git a/mm/slab.h b/mm/slab.h index f2c32f24da95..7b60ef2f32c3 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -298,7 +298,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, if (!memcg_kmem_enabled() || !objcg) return; - flags &= ~__GFP_ACCOUNT; for (i = 0; i < size; i++) { if (likely(p[i])) { page = virt_to_head_page(p[i]); -- cgit From 494c1dfe855ec1f70f89552fce5eadf4a1717552 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:38 -0700 Subject: mm: memcg/slab: create a new set of kmalloc-cg- caches There are currently two problems in the way the objcg pointer array (memcg_data) in the page structure is being allocated and freed. On its allocation, it is possible that the allocated objcg pointer array comes from the same slab that requires memory accounting. If this happens, the slab will never become empty again as there is at least one object left (the obj_cgroup array) in the slab. When it is freed, the objcg pointer array object may be the last one in its slab and hence causes kfree() to be called again. With the right workload, the slab cache may be set up in a way that allows the recursive kfree() calling loop to nest deep enough to cause a kernel stack overflow and panic the system. One way to solve this problem is to split the kmalloc- caches (KMALLOC_NORMAL) into two separate sets - a new set of kmalloc- (KMALLOC_NORMAL) caches for unaccounted objects only and a new set of kmalloc-cg- (KMALLOC_CGROUP) caches for accounted objects only. All the other caches can still allow a mix of accounted and unaccounted objects. With this change, all the objcg pointer array objects will come from KMALLOC_NORMAL caches which won't have their objcg pointer arrays. So both the recursive kfree() problem and non-freeable slab problem are gone. Since both the KMALLOC_NORMAL and KMALLOC_CGROUP caches no longer have mixed accounted and unaccounted objects, this will slightly reduce the number of objcg pointer arrays that need to be allocated and save a bit of memory. 
On the other hand, creating a new set of kmalloc caches does have the effect of reducing cache utilization. So it is properly a wash. The new KMALLOC_CGROUP is added between KMALLOC_NORMAL and KMALLOC_RECLAIM so that the first for loop in create_kmalloc_caches() will include the newly added caches without change. [vbabka@suse.cz: don't create kmalloc-cg caches with cgroup.memory=nokmem] Link: https://lkml.kernel.org/r/20210512145107.6208-1-longman@redhat.com [akpm@linux-foundation.org: un-fat-finger v5 delta creation] [longman@redhat.com: disable cache merging for KMALLOC_NORMAL caches] Link: https://lkml.kernel.org/r/20210505200610.13943-4-longman@redhat.com Link: https://lkml.kernel.org/r/20210512145107.6208-1-longman@redhat.com Link: https://lkml.kernel.org/r/20210505200610.13943-3-longman@redhat.com Signed-off-by: Waiman Long Signed-off-by: Vlastimil Babka Suggested-by: Vlastimil Babka Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Pekka Enberg Cc: Vladimir Davydov [longman@redhat.com: fix for CONFIG_ZONE_DMA=n] Suggested-by: Roman Gushchin Reviewed-by: Vlastimil Babka Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 42 +++++++++++++++++++++++++++++++++--------- mm/internal.h | 5 +++++ mm/memcontrol.c | 2 +- mm/slab_common.c | 32 +++++++++++++++++++++++--------- 4 files changed, 62 insertions(+), 19 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index bc9ab3a5a017..083f3ce550bc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -305,9 +305,21 @@ static inline void __check_heap_object(const void *ptr, unsigned long n, /* * Whenever changing this, take care of that kmalloc_type() and * create_kmalloc_caches() still work as intended. + * + * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP + * is for accounted but unreclaimable and non-dma objects. All the other + * kmem caches can have both accounted and unaccounted objects. */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, +#ifndef CONFIG_ZONE_DMA + KMALLOC_DMA = KMALLOC_NORMAL, +#endif +#ifndef CONFIG_MEMCG_KMEM + KMALLOC_CGROUP = KMALLOC_NORMAL, +#else + KMALLOC_CGROUP, +#endif KMALLOC_RECLAIM, #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, @@ -319,24 +331,36 @@ enum kmalloc_cache_type { extern struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; +/* + * Define gfp bits that should not be set for KMALLOC_NORMAL. + */ +#define KMALLOC_NOT_NORMAL_BITS \ + (__GFP_RECLAIMABLE | \ + (IS_ENABLED(CONFIG_ZONE_DMA) ? __GFP_DMA : 0) | \ + (IS_ENABLED(CONFIG_MEMCG_KMEM) ? __GFP_ACCOUNT : 0)) + static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) { -#ifdef CONFIG_ZONE_DMA /* * The most common case is KMALLOC_NORMAL, so test for it - * with a single branch for both flags. + * with a single branch for all the relevant flags. */ - if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) + if (likely((flags & KMALLOC_NOT_NORMAL_BITS) == 0)) return KMALLOC_NORMAL; /* - * At least one of the flags has to be set. If both are, __GFP_DMA - * is more important. + * At least one of the flags has to be set. Their priorities in + * decreasing order are: + * 1) __GFP_DMA + * 2) __GFP_RECLAIMABLE + * 3) __GFP_ACCOUNT */ - return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM; -#else - return flags & __GFP_RECLAIMABLE ? 
KMALLOC_RECLAIM : KMALLOC_NORMAL; -#endif + if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA)) + return KMALLOC_DMA; + if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || (flags & __GFP_RECLAIMABLE)) + return KMALLOC_RECLAIM; + else + return KMALLOC_CGROUP; } /* diff --git a/mm/internal.h b/mm/internal.h index e8fdb531f887..2946dfa0f245 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -115,6 +115,11 @@ extern void putback_lru_page(struct page *page); */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +/* + * in mm/memcontrol.c: + */ +extern bool cgroup_memory_nokmem; + /* * in mm/page_alloc.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2508bd97349c..b913950b9f64 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -83,7 +83,7 @@ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); static bool cgroup_memory_nosocket; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem; +bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP diff --git a/mm/slab_common.c b/mm/slab_common.c index 6c0db9f9bd8a..db3f356bf725 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -738,21 +738,25 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } #ifdef CONFIG_ZONE_DMA -#define INIT_KMALLOC_INFO(__size, __short_size) \ -{ \ - .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ - .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ - .size = __size, \ -} +#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, +#else +#define KMALLOC_DMA_NAME(sz) +#endif + +#ifdef CONFIG_MEMCG_KMEM +#define KMALLOC_CGROUP_NAME(sz) .name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz, #else +#define KMALLOC_CGROUP_NAME(sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + KMALLOC_CGROUP_NAME(__short_size) \ + KMALLOC_DMA_NAME(__short_size) \ .size = __size, \ } -#endif /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. @@ -838,8 +842,15 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { - if (type == KMALLOC_RECLAIM) + if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; + } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { + if (cgroup_memory_nokmem) { + kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; + return; + } + flags |= SLAB_ACCOUNT; + } kmalloc_caches[type][idx] = create_kmalloc_cache( kmalloc_info[idx].name[type], @@ -857,6 +868,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) int i; enum kmalloc_cache_type type; + /* + * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined + */ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) -- cgit From 13e680fb6a1e7749ef4f4824ed883684ceb838df Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 28 Jun 2021 19:37:41 -0700 Subject: mm: memcg/slab: disable cache merging for KMALLOC_NORMAL caches The KMALLOC_NORMAL (kmalloc-) caches are for unaccounted objects only when CONFIG_MEMCG_KMEM is enabled. To make sure that this condition remains true, we will have to prevent KMALOC_NORMAL caches to merge with other kmem caches. This is now done by setting its refcount to -1 right after its creation. 
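For context, slab cache merging already treats a negative refcount as "never merge"; the patch simply reuses that property for the KMALLOC_NORMAL caches. A paraphrased sketch of the relevant check in mm/slab_common.c (reconstructed from this kernel era, not the verbatim source):

int slab_unmergeable(struct kmem_cache *s)
{
	if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
		return 1;

	if (s->ctor)
		return 1;

	/*
	 * A cache with a negative refcount is never considered for
	 * merging; this is what setting refcount to -1 relies on.
	 */
	if (s->refcount < 0)
		return 1;

	return 0;
}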
Link: https://lkml.kernel.org/r/20210505200610.13943-4-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Roman Gushchin Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Pekka Enberg Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/slab_common.c b/mm/slab_common.c index db3f356bf725..c126e6f6b5a5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -856,6 +856,13 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) kmalloc_info[idx].name[type], kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); + + /* + * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for + * KMALLOC_NORMAL caches. + */ + if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL)) + kmalloc_caches[type][idx]->refcount = -1; } /* -- cgit From c5c8b16b596e15471db22ed8ed10aafbf1a11878 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:44 -0700 Subject: mm: memcontrol: fix root_mem_cgroup charging The below scenario can cause the page counters of the root_mem_cgroup to be out of balance. CPU0: CPU1: objcg = get_obj_cgroup_from_current() obj_cgroup_charge_pages(objcg) memcg_reparent_objcgs() // reparent to root_mem_cgroup WRITE_ONCE(iter->memcg, parent) // memcg == root_mem_cgroup memcg = get_mem_cgroup_from_objcg(objcg) // do not charge to the root_mem_cgroup try_charge(memcg) obj_cgroup_uncharge_pages(objcg) memcg = get_mem_cgroup_from_objcg(objcg) // uncharge from the root_mem_cgroup refill_stock(memcg) drain_stock(memcg) page_counter_uncharge(&memcg->memory) get_obj_cgroup_from_current() never returns a root_mem_cgroup's objcg, so we never explicitly charge the root_mem_cgroup. And it's not going to change. It's all about a race when we got an obj_cgroup pointing at some non-root memcg, but before we were able to charge it, the cgroup was gone, objcg was reparented to the root and so we're skipping the charging. Then we store the objcg pointer and later use to uncharge the root_mem_cgroup. This can cause the page counter to be less than the actual value. Although we do not display the value (mem_cgroup_usage) so there shouldn't be any actual problem, but there is a WARN_ON_ONCE in the page_counter_cancel(). Who knows if it will trigger? So it is better to fix it. 
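The shape of the fix is a split of try_charge(): the root_mem_cgroup short-circuit moves into a thin wrapper used by the normal page-charging paths, while the objcg path calls the worker directly so that its charge and the later uncharge stay balanced even after reparenting. A condensed sketch follows (the full version is in the diff below; statistics updates and cgroup-v1 bookkeeping are omitted here):

/* Does the actual charging; no root_mem_cgroup short-circuit. */
static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
			    unsigned int nr_pages);

/* Wrapper kept for the regular paths, which still skip the root memcg. */
static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
			     unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return 0;

	return try_charge_memcg(memcg, gfp_mask, nr_pages);
}

/*
 * The objcg path charges whichever memcg the objcg currently points to,
 * even if reparenting already moved it to the root, so the matching
 * uncharge can no longer drive the root page counters negative.
 */
static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
				   unsigned int nr_pages)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_objcg(objcg);
	int ret = try_charge_memcg(memcg, gfp, nr_pages);

	css_put(&memcg->css);
	return ret;
}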
Link: https://lkml.kernel.org/r/20210425075410.19255-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Xiongchun Duan Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b913950b9f64..70690fdf53cc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2568,8 +2568,8 @@ out: css_put(&memcg->css); } -static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages) +static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MAX_RECLAIM_RETRIES; @@ -2581,8 +2581,6 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, bool drained = false; unsigned long pflags; - if (mem_cgroup_is_root(memcg)) - return 0; retry: if (consume_stock(memcg, nr_pages)) return 0; @@ -2762,6 +2760,15 @@ done_restock: return 0; } +static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) +{ + if (mem_cgroup_is_root(memcg)) + return 0; + + return try_charge_memcg(memcg, gfp_mask, nr_pages); +} + #if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { @@ -2997,7 +3004,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, memcg = get_mem_cgroup_from_objcg(objcg); - ret = try_charge(memcg, gfp, nr_pages); + ret = try_charge_memcg(memcg, gfp, nr_pages); if (ret) goto out; -- cgit From 8dc87c7d1fec8851925ca96ade0d65d3dcf89cce Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:47 -0700 Subject: mm: memcontrol: fix page charging in page replacement Patch series "memcontrol code cleanup and simplification", v3. This patch (of 8): The pages aren't accounted at the root level, so do not charge the page to the root memcg in page replacement. Although we do not display the value (mem_cgroup_usage) so there shouldn't be any actual problem, but there is a WARN_ON_ONCE in the page_counter_cancel(). Who knows if it will trigger? So it is better to fix it. Link: https://lkml.kernel.org/r/20210417043538.9793-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20210417043538.9793-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 70690fdf53cc..239f69ed1ac1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6984,9 +6984,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) /* Force-charge the new page. 
The old one will be freed soon */ nr_pages = thp_nr_pages(newpage); - page_counter_charge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_charge(&memcg->memsw, nr_pages); + if (!mem_cgroup_is_root(memcg)) { + page_counter_charge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, nr_pages); + } css_get(&memcg->css); commit_charge(newpage, memcg); -- cgit From 2884b6b7eed4fc14c0630fb16e56a4c66c786d33 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:50 -0700 Subject: mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm When mm is NULL, we do not need to hold rcu lock and call css_tryget for the root memcg. And we also do not need to check !mm in every loop of while. So bail out early when !mm. Link: https://lkml.kernel.org/r/20210417043538.9793-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 239f69ed1ac1..babbaf49ee36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -919,20 +919,23 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (mem_cgroup_disabled()) return NULL; + /* + * Page cache insertions can happen without an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + * + * No need to css_get on root memcg as the reference + * counting is disabled on the root level in the + * cgroup core. See CSS_NO_REF. + */ + if (unlikely(!mm)) + return root_mem_cgroup; + rcu_read_lock(); do { - /* - * Page cache insertions can happen without an - * actual mm context, e.g. during disk probing - * on boot, loopback IO, acct() writes etc. - */ - if (unlikely(!mm)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) memcg = root_mem_cgroup; - else { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - memcg = root_mem_cgroup; - } } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; -- cgit From a984226f457f849eb9c4ce727eeaa3b5080597d8 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:53 -0700 Subject: mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page) as the 2nd parameter to it (except isolate_migratepages_block()). But for isolate_migratepages_block(), the page_pgdat(page) is also equal to the local variable of @pgdat. So mem_cgroup_page_lruvec() do not need the pgdat parameter. Just remove it to simplify the code. 
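In effect the helper now derives the pgdat from the page itself, so every caller drops an argument. A condensed sketch of the new CONFIG_MEMCG version and one simplified call site (debug checks omitted; the full hunks are in the diff below):

static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
{
	pg_data_t *pgdat = page_pgdat(page);	/* looked up internally now */
	struct mem_cgroup *memcg = page_memcg(page);

	return mem_cgroup_lruvec(memcg, pgdat);
}

/* Callers such as lock_page_lruvec() shrink accordingly: */
struct lruvec *lock_page_lruvec(struct page *page)
{
	struct lruvec *lruvec = mem_cgroup_page_lruvec(page);

	spin_lock(&lruvec->lru_lock);
	return lruvec;
}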
Link: https://lkml.kernel.org/r/20210417043538.9793-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 +++++----- mm/compaction.c | 2 +- mm/memcontrol.c | 9 +++------ mm/swap.c | 2 +- mm/workingset.c | 2 +- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c193be760709..f2a5aaba3577 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -743,13 +743,12 @@ out: /** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page - * @pgdat: pgdat of the page * * This function relies on page->mem_cgroup being stable. */ -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, - struct pglist_data *pgdat) +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) { + pg_data_t *pgdat = page_pgdat(page); struct mem_cgroup *memcg = page_memcg(page); VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); @@ -1221,9 +1220,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, return &pgdat->__lruvec; } -static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, - struct pglist_data *pgdat) +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) { + pg_data_t *pgdat = page_pgdat(page); + return &pgdat->__lruvec; } diff --git a/mm/compaction.c b/mm/compaction.c index 84fde270ae74..7d41b58fb17c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1028,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = mem_cgroup_page_lruvec(page); /* If we already hold the lock, we can skip some rechecking */ if (lruvec != locked) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index babbaf49ee36..946a9a483e71 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1199,9 +1199,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) struct lruvec *lock_page_lruvec(struct page *page) { struct lruvec *lruvec; - struct pglist_data *pgdat = page_pgdat(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = mem_cgroup_page_lruvec(page); spin_lock(&lruvec->lru_lock); lruvec_memcg_debug(lruvec, page); @@ -1212,9 +1211,8 @@ struct lruvec *lock_page_lruvec(struct page *page) struct lruvec *lock_page_lruvec_irq(struct page *page) { struct lruvec *lruvec; - struct pglist_data *pgdat = page_pgdat(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = mem_cgroup_page_lruvec(page); spin_lock_irq(&lruvec->lru_lock); lruvec_memcg_debug(lruvec, page); @@ -1225,9 +1223,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page) struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) { struct lruvec *lruvec; - struct pglist_data *pgdat = page_pgdat(page); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = mem_cgroup_page_lruvec(page); spin_lock_irqsave(&lruvec->lru_lock, *flags); lruvec_memcg_debug(lruvec, page); diff --git a/mm/swap.c b/mm/swap.c index dfb48cf9c2c9..18cc9e63515b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) void lru_note_cost_page(struct page *page) { - 
lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)), + lru_note_cost(mem_cgroup_page_lruvec(page), page_is_file_lru(page), thp_nr_pages(page)); } diff --git a/mm/workingset.c b/mm/workingset.c index b7cdeca5a76d..4f7a306ce75a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -408,7 +408,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + lruvec = mem_cgroup_page_lruvec(page); workingset_age_nonresident(lruvec, thp_nr_pages(page)); out: rcu_read_unlock(); -- cgit From f2e4d28dd9f6478dd54d47b91edc3fe62c019968 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:56 -0700 Subject: mm: memcontrol: simplify lruvec_holds_page_lru_lock We already have a helper lruvec_memcg() to get the memcg from lruvec, we do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat to simplify the code. We can have a single definition for this function that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and CONFIG_MEMCG. Link: https://lkml.kernel.org/r/20210417043538.9793-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f2a5aaba3577..2fc728492c9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) return mem_cgroup_lruvec(memcg, pgdat); } -static inline bool lruvec_holds_page_lru_lock(struct page *page, - struct lruvec *lruvec) -{ - pg_data_t *pgdat = page_pgdat(page); - const struct mem_cgroup *memcg; - struct mem_cgroup_per_node *mz; - - if (mem_cgroup_disabled()) - return lruvec == &pgdat->__lruvec; - - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - memcg = page_memcg(page) ? 
: root_mem_cgroup; - - return lruvec->pgdat == pgdat && mz->memcg == memcg; -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); @@ -1227,14 +1211,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page) return &pgdat->__lruvec; } -static inline bool lruvec_holds_page_lru_lock(struct page *page, - struct lruvec *lruvec) -{ - pg_data_t *pgdat = page_pgdat(page); - - return lruvec == &pgdat->__lruvec; -} - static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) { } @@ -1516,6 +1492,13 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } +static inline bool lruvec_holds_page_lru_lock(struct page *page, + struct lruvec *lruvec) +{ + return lruvec_pgdat(lruvec) == page_pgdat(page) && + lruvec_memcg(lruvec) == page_memcg(page); +} + /* Don't lock again iff page's lruvec locked */ static inline struct lruvec *relock_page_lruvec_irq(struct page *page, struct lruvec *locked_lruvec) -- cgit From 7467c39128bda1d58af08aaeb0c7ba54d0ec87ae Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:37:59 -0700 Subject: mm: memcontrol: rename lruvec_holds_page_lru_lock to page_matches_lruvec lruvec_holds_page_lru_lock() doesn't check anything about locking and is used to check whether the page belongs to the lruvec. So rename it to page_matches_lruvec(). Link: https://lkml.kernel.org/r/20210417043538.9793-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++---- mm/vmscan.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2fc728492c9b..0ce97eff79e2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1492,8 +1492,8 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } -static inline bool lruvec_holds_page_lru_lock(struct page *page, - struct lruvec *lruvec) +/* Test requires a stable page->memcg binding, see page_memcg() */ +static inline bool page_matches_lruvec(struct page *page, struct lruvec *lruvec) { return lruvec_pgdat(lruvec) == page_pgdat(page) && lruvec_memcg(lruvec) == page_memcg(page); @@ -1504,7 +1504,7 @@ static inline struct lruvec *relock_page_lruvec_irq(struct page *page, struct lruvec *locked_lruvec) { if (locked_lruvec) { - if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + if (page_matches_lruvec(page, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irq(locked_lruvec); @@ -1518,7 +1518,7 @@ static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page, struct lruvec *locked_lruvec, unsigned long *flags) { if (locked_lruvec) { - if (lruvec_holds_page_lru_lock(page, locked_lruvec)) + if (page_matches_lruvec(page, locked_lruvec)) return locked_lruvec; unlock_page_lruvec_irqrestore(locked_lruvec, *flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 5199b9696bab..ec93d4fd5a6b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2063,7 +2063,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). 
*/ - VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); + VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page); add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; -- cgit From 9838354e16a2a920d5a228559850d10fa588a18d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:38:03 -0700 Subject: mm: memcontrol: simplify the logic of objcg pinning memcg The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by the css_set_lock. We do not need to care about objcg->memcg being released in the process of obj_cgroup_release(). So there is no need to pin memcg before releasing objcg. Remove those pinning logic to simplfy the code. There are only two places that modifies the objcg->memcg. One is the initialization to objcg->memcg in the memcg_online_kmem(), another is objcgs reparenting in the memcg_reparent_objcgs(). It is also impossible for the two to run in parallel. So xchg() is unnecessary and it is enough to use WRITE_ONCE(). Link: https://lkml.kernel.org/r/20210417043538.9793-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 946a9a483e71..c79b6926fe83 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -261,7 +261,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static void obj_cgroup_release(struct percpu_ref *ref) { struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); - struct mem_cgroup *memcg; unsigned int nr_bytes; unsigned int nr_pages; unsigned long flags; @@ -291,11 +290,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) nr_pages = nr_bytes >> PAGE_SHIFT; spin_lock_irqsave(&css_set_lock, flags); - memcg = obj_cgroup_memcg(objcg); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); list_del(&objcg->list); - mem_cgroup_put(memcg); spin_unlock_irqrestore(&css_set_lock, flags); percpu_ref_exit(ref); @@ -330,17 +327,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, spin_lock_irq(&css_set_lock); - /* Move active objcg to the parent's list */ - xchg(&objcg->memcg, parent); - css_get(&parent->css); - list_add(&objcg->list, &parent->objcg_list); - - /* Move already reparented objcgs to the parent's list */ - list_for_each_entry(iter, &memcg->objcg_list, list) { - css_get(&parent->css); - xchg(&iter->memcg, parent); - css_put(&memcg->css); - } + /* 1) Ready to reparent active objcg. */ + list_add(&objcg->list, &memcg->objcg_list); + /* 2) Reparent active objcg and already reparented objcgs to parent. */ + list_for_each_entry(iter, &memcg->objcg_list, list) + WRITE_ONCE(iter->memcg, parent); + /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); spin_unlock_irq(&css_set_lock); -- cgit From 271dd6b1f636a99a3a77889935296c063f5a3cbe Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:38:06 -0700 Subject: mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock The css_set_lock is used to guard the list of inherited objcgs. So there is no need to uncharge kernel memory under css_set_lock. Just move it out of the lock. 
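The resulting ordering in obj_cgroup_release() is: uncharge the leftover pages first, then take css_set_lock only around the list removal. A condensed sketch of the tail of that function after the change (see the one-hunk diff below):

	nr_pages = nr_bytes >> PAGE_SHIFT;

	/* Uncharge outside the lock; css_set_lock only guards the objcg list. */
	if (nr_pages)
		obj_cgroup_uncharge_pages(objcg, nr_pages);

	spin_lock_irqsave(&css_set_lock, flags);
	list_del(&objcg->list);
	spin_unlock_irqrestore(&css_set_lock, flags);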
Link: https://lkml.kernel.org/r/20210417043538.9793-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c79b6926fe83..f7a552eb3e9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -289,9 +289,10 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - spin_lock_irqsave(&css_set_lock, flags); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); + + spin_lock_irqsave(&css_set_lock, flags); list_del(&objcg->list); spin_unlock_irqrestore(&css_set_lock, flags); -- cgit From 9ef56b78b888c2fa15b6140fbdb88853d7d4fecd Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 28 Jun 2021 19:38:09 -0700 Subject: mm: vmscan: remove noinline_for_stack The noinline_for_stack is introduced by commit 666356297ec4 ("vmscan: set up pagevec as late as possible in shrink_inactive_list()"), its purpose is to delay the allocation of pagevec as late as possible to save stack memory. But the commit 2bcf88796381 ("mm: take pagevecs off reclaim stack") replace pagevecs by lists of pages_to_free. So we do not need noinline_for_stack, just remove it (let the compiler decide whether to inline). Link: https://lkml.kernel.org/r/20210417043538.9793-9-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ec93d4fd5a6b..f96d62159720 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2015,8 +2015,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * * Returns the number of pages moved to the given lruvec. */ -static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_pages_to_lru(struct lruvec *lruvec, + struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); @@ -2096,7 +2096,7 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static noinline_for_stack unsigned long +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { -- cgit From b51478a0b3c7040bfcadf6e2e04df5ddde59fd98 Mon Sep 17 00:00:00 2001 From: wenhuizhang Date: Mon, 28 Jun 2021 19:38:12 -0700 Subject: memcontrol: use flexible-array member Change deprecated zero-length-and-one-element-arrays into flexible array member.Zero-length and one-element arrays detected by Lukas's CodeChecker. Zero/one element arrays cause undefined behaviours if sizeof() used. 
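As a small, self-contained user-space illustration (not kernel code) of why the flexible array member is preferred over the old nodeinfo[0] trick: the trailing array contributes nothing to sizeof(), so the allocation size must be spelled out explicitly. In the kernel, the struct_size() helper is typically used for the same computation with overflow checking.

#include <stdio.h>
#include <stdlib.h>

struct node_stats { long counters[4]; };

struct memcg_like {
	int id;
	/* Flexible array member: must be last, adds nothing to sizeof(). */
	struct node_stats *nodeinfo[];
};

int main(void)
{
	int nr_nodes = 4;
	/* Header plus one pointer slot per node, sized explicitly. */
	struct memcg_like *m = malloc(sizeof(*m) +
				      nr_nodes * sizeof(m->nodeinfo[0]));

	if (!m)
		return 1;

	printf("header is %zu bytes; nodeinfo[] is not included\n", sizeof(*m));
	free(m);
	return 0;
}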
Link: https://lkml.kernel.org/r/20210518200910.29912-1-wenhui@gwmail.gwu.edu Signed-off-by: wenhuizhang Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Shakeel Butt Cc: Johannes Weiner Cc: Roman Gushchin Cc: Muchun Song Cc: Yang Shi Cc: Alex Shi Cc: Alexander Duyck Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0ce97eff79e2..3cc18c2176e7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -349,8 +349,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif - struct mem_cgroup_per_node *nodeinfo[0]; - /* WARNING: nodeinfo must be the last member here */ + struct mem_cgroup_per_node *nodeinfo[]; }; /* -- cgit From 87579e9b7d8dc36e7cfc40c03f1ae5634e16e2c5 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Mon, 28 Jun 2021 19:38:15 -0700 Subject: loop: use worker per cgroup instead of kworker Patch series "Charge loop device i/o to issuing cgroup", v14. The loop device runs all i/o to the backing file on a separate kworker thread which results in all i/o being charged to the root cgroup. This allows a loop device to be used to trivially bypass resource limits and other policy. This patch series fixes this gap in accounting. A simple script to demonstrate this behavior on cgroupv2 machine: ''' #!/bin/bash set -e CGROUP=/sys/fs/cgroup/test.slice LOOP_DEV=/dev/loop0 if [[ ! -d $CGROUP ]] then sudo mkdir $CGROUP fi grep oom_kill $CGROUP/memory.events # Set a memory limit, write more than that limit to tmpfs -> OOM kill sudo unshare -m bash -c " echo \$\$ > $CGROUP/cgroup.procs; echo 0 > $CGROUP/memory.swap.max; echo 64M > $CGROUP/memory.max; mount -t tmpfs -o size=512m tmpfs /tmp; dd if=/dev/zero of=/tmp/file bs=1M count=256" || true grep oom_kill $CGROUP/memory.events # Set a memory limit, write more than that limit through loopback # device -> no OOM kill sudo unshare -m bash -c " echo \$\$ > $CGROUP/cgroup.procs; echo 0 > $CGROUP/memory.swap.max; echo 64M > $CGROUP/memory.max; mount -t tmpfs -o size=512m tmpfs /tmp; truncate -s 512m /tmp/backing_file losetup $LOOP_DEV /tmp/backing_file dd if=/dev/zero of=$LOOP_DEV bs=1M count=256; losetup -D $LOOP_DEV" || true grep oom_kill $CGROUP/memory.events ''' Naively charging cgroups could result in priority inversions through the single kworker thread in the case where multiple cgroups are reading/writing to the same loop device. This patch series does some minor modification to the loop driver so that each cgroup can make forward progress independently to avoid this inversion. With this patch series applied, the above script triggers OOM kills when writing through the loop device as expected. This patch (of 3): Existing uses of loop device may have multiple cgroups reading/writing to the same device. Simply charging resources for I/O to the backing file could result in priority inversion where one cgroup gets synchronously blocked, holding up all other I/O to the loop device. In order to avoid this priority inversion, we use a single workqueue where each work item is a "struct loop_worker" which contains a queue of struct loop_cmds to issue. The loop device maintains a tree mapping blk css_id -> loop_worker. This allows each cgroup to independently make forward progress issuing I/O to the backing file. 
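As a rough sketch of the structure just described (field names follow the struct loop_worker and loop_queue_work() hunks later in this entry; the lookup is shown as a hypothetical standalone helper here, whereas the patch open-codes it, and locking and tree insertion are trimmed):

struct loop_worker {
	struct rb_node rb_node;		/* keyed by the css pointer below */
	struct work_struct work;	/* one work item per cgroup */
	struct list_head cmd_list;	/* loop_cmds queued for this cgroup */
	struct list_head idle_list;	/* linkage for idle-worker reaping */
	struct loop_device *lo;
	struct cgroup_subsys_state *css;
	unsigned long last_ran_at;
};

/* Hypothetical helper: find the worker serving @css in lo->worker_tree. */
static struct loop_worker *loop_find_worker(struct loop_device *lo,
					    struct cgroup_subsys_state *css)
{
	struct rb_node *node = lo->worker_tree.rb_node;

	while (node) {
		struct loop_worker *cur =
			container_of(node, struct loop_worker, rb_node);

		if (cur->css == css)
			return cur;
		else if ((long)cur->css < (long)css)
			node = node->rb_left;
		else
			node = node->rb_right;
	}

	/* No worker yet: the caller allocates one, or queues on the rootcg. */
	return NULL;
}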
There is also a single queue for I/O associated with the rootcg which can be used in cases of extreme memory shortage where we cannot allocate a loop_worker. The locking for the tree and queues is fairly heavy handed - we acquire a per-loop-device spinlock any time either is accessed. The existing implementation serializes all I/O through a single thread anyways, so I don't believe this is any worse. [colin.king@canonical.com: fixes] Link: https://lkml.kernel.org/r/20210610173944.1203706-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20210610173944.1203706-2-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Reviewed-by: Ming Lei Acked-by: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Chris Down Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 209 +++++++++++++++++++++++++++++++++++++++++++-------- drivers/block/loop.h | 12 ++- 2 files changed, 187 insertions(+), 34 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 76e12f3482a9..54ed3ebbbc37 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -71,7 +71,6 @@ #include #include #include -#include #include #include #include @@ -84,6 +83,8 @@ #include +#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) + static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -921,27 +922,95 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_alignment = 0; } -static void loop_unprepare_queue(struct loop_device *lo) +struct loop_worker { + struct rb_node rb_node; + struct work_struct work; + struct list_head cmd_list; + struct list_head idle_list; + struct loop_device *lo; + struct cgroup_subsys_state *css; + unsigned long last_ran_at; +}; + +static void loop_workfn(struct work_struct *work); +static void loop_rootcg_workfn(struct work_struct *work); +static void loop_free_idle_workers(struct timer_list *timer); + +#ifdef CONFIG_BLK_CGROUP +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { - kthread_flush_worker(&lo->worker); - kthread_stop(lo->worker_task); + return !css || css == blkcg_root_css; } - -static int loop_kthread_worker_fn(void *worker_ptr) +#else +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { - current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - return kthread_worker_fn(worker_ptr); + return !css; } +#endif -static int loop_prepare_queue(struct loop_device *lo) +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - kthread_init_worker(&lo->worker); - lo->worker_task = kthread_run(loop_kthread_worker_fn, - &lo->worker, "loop%d", lo->lo_number); - if (IS_ERR(lo->worker_task)) - return -ENOMEM; - set_user_nice(lo->worker_task, MIN_NICE); - return 0; + struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL; + struct loop_worker *cur_worker, *worker = NULL; + struct work_struct *work; + struct list_head *cmd_list; + + spin_lock_irq(&lo->lo_work_lock); + + if (queue_on_root_worker(cmd->css)) + goto queue_work; + + node = &lo->worker_tree.rb_node; + + while (*node) { + parent = *node; + cur_worker = container_of(*node, struct loop_worker, rb_node); + if (cur_worker->css == cmd->css) { + worker = cur_worker; + break; + } else if ((long)cur_worker->css < (long)cmd->css) { + node = &(*node)->rb_left; + } else { + node = &(*node)->rb_right; + } + } + if (worker) + goto queue_work; + + worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); + /* + * In the event we cannot allocate a 
worker, just queue on the + * rootcg worker + */ + if (!worker) + goto queue_work; + + worker->css = cmd->css; + css_get(worker->css); + INIT_WORK(&worker->work, loop_workfn); + INIT_LIST_HEAD(&worker->cmd_list); + INIT_LIST_HEAD(&worker->idle_list); + worker->lo = lo; + rb_link_node(&worker->rb_node, parent, node); + rb_insert_color(&worker->rb_node, &lo->worker_tree); +queue_work: + if (worker) { + /* + * We need to remove from the idle list here while + * holding the lock so that the idle timer doesn't + * free the worker + */ + if (!list_empty(&worker->idle_list)) + list_del_init(&worker->idle_list); + work = &worker->work; + cmd_list = &worker->cmd_list; + } else { + work = &lo->rootcg_work; + cmd_list = &lo->rootcg_cmd_list; + } + list_add_tail(&cmd->list_entry, cmd_list); + queue_work(lo->workqueue, work); + spin_unlock_irq(&lo->lo_work_lock); } static void loop_update_rotational(struct loop_device *lo) @@ -1127,12 +1196,23 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; - error = loop_prepare_queue(lo); - if (error) + lo->workqueue = alloc_workqueue("loop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, + lo->lo_number); + if (!lo->workqueue) { + error = -ENOMEM; goto out_unlock; + } set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); + INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn); + INIT_LIST_HEAD(&lo->rootcg_cmd_list); + INIT_LIST_HEAD(&lo->idle_worker_list); + lo->worker_tree = RB_ROOT; + timer_setup(&lo->timer, loop_free_idle_workers, + TIMER_DEFERRABLE); lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; @@ -1200,6 +1280,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) int err = 0; bool partscan = false; int lo_number; + struct loop_worker *pos, *worker; mutex_lock(&lo->lo_mutex); if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) { @@ -1219,6 +1300,18 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) /* freeze request queue during the transition */ blk_mq_freeze_queue(lo->lo_queue); + destroy_workqueue(lo->workqueue); + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->css); + kfree(worker); + } + spin_unlock_irq(&lo->lo_work_lock); + del_timer_sync(&lo->timer); + spin_lock_irq(&lo->lo_lock); lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1255,7 +1348,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; - loop_unprepare_queue(lo); out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { @@ -2015,7 +2107,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, } else #endif cmd->css = NULL; - kthread_queue_work(&lo->worker, &cmd->work); + loop_queue_work(lo, cmd); return BLK_STS_OK; } @@ -2045,26 +2137,82 @@ static void loop_handle_cmd(struct loop_cmd *cmd) } } -static void loop_queue_work(struct kthread_work *work) +static void loop_set_timer(struct loop_device *lo) +{ + timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT); +} + +static void loop_process_work(struct loop_worker *worker, + struct list_head *cmd_list, struct loop_device *lo) { - struct loop_cmd *cmd = - container_of(work, struct loop_cmd, work); + int orig_flags = current->flags; + struct loop_cmd *cmd; - loop_handle_cmd(cmd); + current->flags |= PF_LOCAL_THROTTLE 
| PF_MEMALLOC_NOIO; + spin_lock_irq(&lo->lo_work_lock); + while (!list_empty(cmd_list)) { + cmd = container_of( + cmd_list->next, struct loop_cmd, list_entry); + list_del(cmd_list->next); + spin_unlock_irq(&lo->lo_work_lock); + + loop_handle_cmd(cmd); + cond_resched(); + + spin_lock_irq(&lo->lo_work_lock); + } + + /* + * We only add to the idle list if there are no pending cmds + * *and* the worker will not run again which ensures that it + * is safe to free any worker on the idle list + */ + if (worker && !work_pending(&worker->work)) { + worker->last_ran_at = jiffies; + list_add_tail(&worker->idle_list, &lo->idle_worker_list); + loop_set_timer(lo); + } + spin_unlock_irq(&lo->lo_work_lock); + current->flags = orig_flags; } -static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) +static void loop_workfn(struct work_struct *work) { - struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct loop_worker *worker = + container_of(work, struct loop_worker, work); + loop_process_work(worker, &worker->cmd_list, worker->lo); +} - kthread_init_work(&cmd->work, loop_queue_work); - return 0; +static void loop_rootcg_workfn(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, rootcg_work); + loop_process_work(NULL, &lo->rootcg_cmd_list, lo); +} + +static void loop_free_idle_workers(struct timer_list *timer) +{ + struct loop_device *lo = container_of(timer, struct loop_device, timer); + struct loop_worker *pos, *worker; + + spin_lock_irq(&lo->lo_work_lock); + list_for_each_entry_safe(worker, pos, &lo->idle_worker_list, + idle_list) { + if (time_is_after_jiffies(worker->last_ran_at + + LOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &lo->worker_tree); + css_put(worker->css); + kfree(worker); + } + if (!list_empty(&lo->idle_worker_list)) + loop_set_timer(lo); + spin_unlock_irq(&lo->lo_work_lock); } static const struct blk_mq_ops loop_mq_ops = { .queue_rq = loop_queue_rq, - .init_request = loop_init_request, .complete = lo_complete_rq, }; @@ -2153,6 +2301,7 @@ static int loop_add(struct loop_device **l, int i) mutex_init(&lo->lo_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); + spin_lock_init(&lo->lo_work_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; disk->fops = &lo_fops; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 5beb959b94d3..f81c01bde5c0 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -14,7 +14,6 @@ #include #include #include -#include #include /* Possible states of device */ @@ -55,8 +54,13 @@ struct loop_device { spinlock_t lo_lock; int lo_state; - struct kthread_worker worker; - struct task_struct *worker_task; + spinlock_t lo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; bool use_dio; bool sysfs_inited; @@ -67,7 +71,7 @@ struct loop_device { }; struct loop_cmd { - struct kthread_work work; + struct list_head list_entry; bool use_aio; /* use AIO interface to handle I/O */ atomic_t ref; /* only for aio */ long ret; -- cgit From 04f94e3fbe1afcb815d7c7ace78c6779772aa837 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Mon, 28 Jun 2021 19:38:18 -0700 Subject: mm: charge active memcg when no mm is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set_active_memcg() 
worked for kernel allocations but was silently ignored for user pages. This patch establishes a precedence order for who gets charged: 1. If there is a memcg associated with the page already, that memcg is charged. This happens during swapin. 2. If an explicit mm is passed, mm->memcg is charged. This happens during page faults, which can be triggered in remote VMs (eg gup). 3. Otherwise consult the current process context. If there is an active_memcg, use that. Otherwise, current->mm->memcg. Previously, if a NULL mm was passed to mem_cgroup_charge (case 3) it would always charge the root cgroup. Now it looks up the active_memcg first (falling back to charging the root cgroup if not set). Link: https://lkml.kernel.org/r/20210610173944.1203706-3-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: Chris Down Acked-by: Jens Axboe Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Cc: Michal Hocko Cc: Ming Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- mm/memcontrol.c | 41 +++++++++++++++++++++++++++-------------- mm/shmem.c | 4 ++-- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 66f7e9fdfbc4..ac82a93d4f38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -872,7 +872,7 @@ noinline int __add_to_page_cache_locked(struct page *page, page->index = offset; if (!huge) { - error = mem_cgroup_charge(page, current->mm, gfp); + error = mem_cgroup_charge(page, NULL, gfp); if (error) goto error; charged = true; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7a552eb3e9d..8f3244e59b30 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -897,13 +897,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) } EXPORT_SYMBOL(mem_cgroup_from_task); +static __always_inline struct mem_cgroup *active_memcg(void) +{ + if (in_interrupt()) + return this_cpu_read(int_active_memcg); + else + return current->active_memcg; +} + /** * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. * @mm: mm from which memcg should be extracted. It can be NULL. * - * Obtain a reference on mm->memcg and returns it if successful. Otherwise - * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is - * returned. + * Obtain a reference on mm->memcg and returns it if successful. If mm + * is NULL, then the memcg is chosen as follows: + * 1) The active memcg, if set. + * 2) current->mm->memcg, if available + * 3) root memcg + * If mem_cgroup is disabled, NULL is returned. */ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { @@ -921,8 +932,17 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) * counting is disabled on the root level in the * cgroup core. See CSS_NO_REF. */ - if (unlikely(!mm)) - return root_mem_cgroup; + if (unlikely(!mm)) { + memcg = active_memcg(); + if (unlikely(memcg)) { + /* remote memcg must hold a ref */ + css_get(&memcg->css); + return memcg; + } + mm = current->mm; + if (unlikely(!mm)) + return root_mem_cgroup; + } rcu_read_lock(); do { @@ -935,14 +955,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -static __always_inline struct mem_cgroup *active_memcg(void) -{ - if (in_interrupt()) - return this_cpu_read(int_active_memcg); - else - return current->active_memcg; -} - static __always_inline bool memcg_kmem_bypass(void) { /* Allow remote memcg charging from any context. 
*/ @@ -6711,7 +6723,8 @@ out: * @gfp_mask: reclaim mode * * Try to charge @page to the memcg that @mm belongs to, reclaiming - * pages according to @gfp_mask if necessary. + * pages according to @gfp_mask if necessary. if @mm is NULL, try to + * charge to the active memcg. * * Do not use this for pages allocated for swapin. * diff --git a/mm/shmem.c b/mm/shmem.c index 53f21016608e..e72931b9246c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1695,7 +1695,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; + struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; struct swap_info_struct *si; struct page *page = NULL; swp_entry_t swap; @@ -1828,7 +1828,7 @@ repeat: } sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; + charge_mm = vma ? vma->vm_mm : NULL; page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0); -- cgit From c74d40e8b5e2ac5eee1ca45b12d3e174915f1d88 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Mon, 28 Jun 2021 19:38:21 -0700 Subject: loop: charge i/o to mem and blk cg The current code only associates with the existing blkcg when aio is used to access the backing file. This patch covers all types of i/o to the backing file and also associates the memcg so if the backing file is on tmpfs, memory is charged appropriately. This patch also exports cgroup_get_e_css and int_active_memcg so it can be used by the loop module. Link: https://lkml.kernel.org/r/20210610173944.1203706-4-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Acked-by: Johannes Weiner Acked-by: Jens Axboe Cc: Chris Down Cc: Michal Hocko Cc: Ming Lei Cc: Shakeel Butt Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++--------------- drivers/block/loop.h | 3 ++- include/linux/memcontrol.h | 6 +++++ kernel/cgroup/cgroup.c | 1 + mm/memcontrol.c | 1 + 5 files changed, 51 insertions(+), 21 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 54ed3ebbbc37..452c7437e1f0 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -78,6 +78,7 @@ #include #include #include +#include #include "loop.h" @@ -516,8 +517,6 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); - if (cmd->css) - css_put(cmd->css); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } @@ -578,8 +577,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); - if (cmd->css) - kthread_associate_blkcg(cmd->css); if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); @@ -587,7 +584,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = call_read_iter(file, &cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); - kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); @@ -928,7 +924,7 @@ struct loop_worker { struct list_head cmd_list; struct list_head idle_list; struct loop_device *lo; - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *blkcg_css; unsigned long last_ran_at; }; @@ -957,7 +953,7 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) 
spin_lock_irq(&lo->lo_work_lock); - if (queue_on_root_worker(cmd->css)) + if (queue_on_root_worker(cmd->blkcg_css)) goto queue_work; node = &lo->worker_tree.rb_node; @@ -965,10 +961,10 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) while (*node) { parent = *node; cur_worker = container_of(*node, struct loop_worker, rb_node); - if (cur_worker->css == cmd->css) { + if (cur_worker->blkcg_css == cmd->blkcg_css) { worker = cur_worker; break; - } else if ((long)cur_worker->css < (long)cmd->css) { + } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { node = &(*node)->rb_left; } else { node = &(*node)->rb_right; @@ -980,13 +976,18 @@ static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN); /* * In the event we cannot allocate a worker, just queue on the - * rootcg worker + * rootcg worker and issue the I/O as the rootcg */ - if (!worker) + if (!worker) { + cmd->blkcg_css = NULL; + if (cmd->memcg_css) + css_put(cmd->memcg_css); + cmd->memcg_css = NULL; goto queue_work; + } - worker->css = cmd->css; - css_get(worker->css); + worker->blkcg_css = cmd->blkcg_css; + css_get(worker->blkcg_css); INIT_WORK(&worker->work, loop_workfn); INIT_LIST_HEAD(&worker->cmd_list); INIT_LIST_HEAD(&worker->idle_list); @@ -1306,7 +1307,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) idle_list) { list_del(&worker->idle_list); rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->css); + css_put(worker->blkcg_css); kfree(worker); } spin_unlock_irq(&lo->lo_work_lock); @@ -2100,13 +2101,18 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, } /* always use the first bio's css */ + cmd->blkcg_css = NULL; + cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { - cmd->css = &bio_blkcg(rq->bio)->css; - css_get(cmd->css); - } else + if (rq->bio && rq->bio->bi_blkg) { + cmd->blkcg_css = &bio_blkcg(rq->bio)->css; +#ifdef CONFIG_MEMCG + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); +#endif + } #endif - cmd->css = NULL; loop_queue_work(lo, cmd); return BLK_STS_OK; @@ -2118,13 +2124,28 @@ static void loop_handle_cmd(struct loop_cmd *cmd) const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; int ret = 0; + struct mem_cgroup *old_memcg = NULL; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } + if (cmd->blkcg_css) + kthread_associate_blkcg(cmd->blkcg_css); + if (cmd->memcg_css) + old_memcg = set_active_memcg( + mem_cgroup_from_css(cmd->memcg_css)); + ret = do_req_filebacked(lo, rq); + + if (cmd->blkcg_css) + kthread_associate_blkcg(NULL); + + if (cmd->memcg_css) { + set_active_memcg(old_memcg); + css_put(cmd->memcg_css); + } failed: /* complete non-aio request */ if (!cmd->use_aio || ret) { @@ -2203,7 +2224,7 @@ static void loop_free_idle_workers(struct timer_list *timer) break; list_del(&worker->idle_list); rb_erase(&worker->rb_node, &lo->worker_tree); - css_put(worker->css); + css_put(worker->blkcg_css); kfree(worker); } if (!list_empty(&lo->idle_worker_list)) diff --git a/drivers/block/loop.h b/drivers/block/loop.h index f81c01bde5c0..1988899db63a 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -77,7 +77,8 @@ struct loop_cmd { long ret; struct kiocb iocb; struct bio_vec *bvec; - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; }; /* 
Support for loadable transfer modules */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3cc18c2176e7..1de3859233a6 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1230,6 +1230,12 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } +static inline +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) +{ + return NULL; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 21ecc6ee6a6d..9cc8c3a686b1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -577,6 +577,7 @@ out_unlock: rcu_read_unlock(); return css; } +EXPORT_SYMBOL_GPL(cgroup_get_e_css); static void cgroup_get_live(struct cgroup *cgrp) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8f3244e59b30..4ee243ce6135 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -78,6 +78,7 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; /* Active memory cgroup to use from an interrupt context */ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); +EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); /* Socket memory accounting disabled? */ static bool cgroup_memory_nosocket; -- cgit From 6a1803bb582c50909a7f6cc4153360eaf5ae8fc8 Mon Sep 17 00:00:00 2001 From: Huilong Deng Date: Mon, 28 Jun 2021 19:38:24 -0700 Subject: mm: memcontrol: remove trailing semicolon in macros Macros should not use a trailing semicolon. Link: https://lkml.kernel.org/r/20210614091530.22117-1-denghuilong@cdjrlc.com Signed-off-by: Huilong Deng Reviewed-by: Andrew Morton Reviewed-by: Shakeel Butt Cc: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1de3859233a6..6d66037be646 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -192,7 +192,7 @@ enum memcg_kmem_state { struct memcg_padding { char x[0]; } ____cacheline_internodealigned_in_smp; -#define MEMCG_PADDING(name) struct memcg_padding name; +#define MEMCG_PADDING(name) struct memcg_padding name #else #define MEMCG_PADDING(name) #endif -- cgit From 8fa207525f6ae241c19cbe4c470c5cb9bea4aab0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Jun 2021 19:38:28 -0700 Subject: perf: MAP_EXECUTABLE does not indicate VM_MAYEXEC Patch series "perf/binfmt/mm: remove in-tree usage of MAP_EXECUTABLE". Stumbling over the history of MAP_EXECUTABLE, I noticed that we still have some in-tree users that we can get rid of. This patch (of 3): Before commit e9714acf8c43 ("mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas"), VM_EXECUTABLE indicated MAP_EXECUTABLE. MAP_EXECUTABLE is nowadays essentially ignored by the kernel and does not relate to VM_MAYEXEC. Link: https://lkml.kernel.org/r/20210421093453.6904-1-david@redhat.com Link: https://lkml.kernel.org/r/20210421093453.6904-2-david@redhat.com Fixes: f972eb63b100 ("perf: Pass protection and flags bits through mmap2 interface") Signed-off-by: David Hildenbrand Cc: Peter Zijlstra Acked-by: "Eric W. Biederman" Reviewed-by: Kees Cook Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Greg Ungerer Cc: Mike Rapoport Cc: Catalin Marinas Cc: Kevin Brodsky Cc: Michal Hocko Cc: Feng Tang Cc: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index fe88d6eea3c2..1c5e3240cdbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8301,8 +8301,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (vma->vm_flags & VM_DENYWRITE) flags |= MAP_DENYWRITE; - if (vma->vm_flags & VM_MAYEXEC) - flags |= MAP_EXECUTABLE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) -- cgit From a4eec6a3dfb7a6257ddcacf15e9428fe5834ffd4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Jun 2021 19:38:31 -0700 Subject: binfmt: remove in-tree usage of MAP_EXECUTABLE Ever since commit e9714acf8c43 ("mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas"), VM_EXECUTABLE is gone and MAP_EXECUTABLE is essentially completely ignored. Let's remove all usage of MAP_EXECUTABLE. [akpm@linux-foundation.org: fix blooper in fs/binfmt_aout.c. per David] Link: https://lkml.kernel.org/r/20210421093453.6904-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: "Eric W. Biederman" Reviewed-by: Kees Cook Cc: Alexander Shishkin Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Catalin Marinas Cc: Don Zickus Cc: Feng Tang Cc: Greg Ungerer Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kevin Brodsky Cc: Mark Rutland Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 4 ++-- fs/binfmt_aout.c | 4 ++-- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 11 ++--------- fs/binfmt_flat.c | 2 +- 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a09fc37ead9d..5e5b9fc2747f 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -203,7 +203,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset); if (error != N_TXTADDR(ex)) @@ -212,7 +212,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3e84e9bb9084..145917f734fe 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset); if (error != N_TXTADDR(ex)) @@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset + ex.a_text); if (error != 
N_DATADDR(ex)) return error; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 187b3f2b9202..baf8f91776f4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1070,7 +1070,7 @@ out_free_interp: elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE; vaddr = elf_ppnt->p_vaddr; /* diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c99b102c860..39fa1b0307e1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; - unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; load_addr = params->load_addr; @@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } /* allocate one big anon block for everything */ - mflags = MAP_PRIVATE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - mflags |= MAP_EXECUTABLE; - maddr = vm_mmap(NULL, load_addr, top - base, - PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) prot |= PROT_EXEC; flags = MAP_PRIVATE | MAP_DENYWRITE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - flags |= MAP_EXECUTABLE; - maddr = 0; switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1072c6a2341..5d776f80ee50 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm, pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, - MAP_PRIVATE|MAP_EXECUTABLE, 0); + MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { ret = textpos; if (!textpos) -- cgit From 3b8db39fad98cbb1d36e079236a446fad710daea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Jun 2021 19:38:35 -0700 Subject: mm: ignore MAP_EXECUTABLE in ksys_mmap_pgoff() Let's also remove masking off MAP_EXECUTABLE from ksys_mmap_pgoff(): the last in-tree occurrence of MAP_EXECUTABLE is now in LEGACY_MAP_MASK, which accepts the flag e.g., for MAP_SHARED_VALIDATE; however, the flag is ignored throughout the kernel now. Add a comment to LEGACY_MAP_MASK stating that MAP_EXECUTABLE is ignored. Link: https://lkml.kernel.org/r/20210421093453.6904-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: "Eric W. Biederman" Reviewed-by: Kees Cook Cc: Alexander Shishkin Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Catalin Marinas Cc: Don Zickus Cc: Feng Tang Cc: Greg Ungerer Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kevin Brodsky Cc: Mark Rutland Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mman.h | 2 ++ mm/mmap.c | 2 +- mm/nommu.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..ebb09a964272 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -31,6 +31,8 @@ /* * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. + * + * MAP_EXECUTABLE is completely ignored throughout the kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ diff --git a/mm/mmap.c b/mm/mmap.c index 0584e540246e..f9a61f7dc540 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1633,7 +1633,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, return PTR_ERR(file); } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + flags &= ~MAP_DENYWRITE; retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: diff --git a/mm/nommu.c b/mm/nommu.c index 85a3a68dffb6..affda71641ca 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1296,7 +1296,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out; } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + flags &= ~MAP_DENYWRITE; retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); -- cgit From 78d9cf6041b968cc881fd22e25e2662d1cec4dba Mon Sep 17 00:00:00 2001 From: Gonzalo Matias Juarez Tello Date: Mon, 28 Jun 2021 19:38:39 -0700 Subject: mm/mmap.c: logic of find_vma_intersection repeated in __do_munmap Logic of find_vma_intersection() is repeated in __do_munmap(). Also, prev is assigned a value before checking vma->vm_start >= end which might end up on a return statement making that assignment useless. Calling find_vma_intersection() checks that condition and returns NULL if no vma is found, hence only the !vma check is needed in __do_munmap(). Link: https://lkml.kernel.org/r/20210409162129.18313-1-gmjuareztello@gmail.com Signed-off-by: Gonzalo Matias Juarez Tello Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f9a61f7dc540..bb128a42557e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2828,16 +2828,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, */ arch_unmap(mm, start, end); - /* Find the first overlapping VMA */ - vma = find_vma(mm, start); + /* Find the first overlapping VMA where start < vma->vm_end */ + vma = find_vma_intersection(mm, start, end); if (!vma) return 0; prev = vma->vm_prev; - /* we have start < vma->vm_end */ - - /* if it doesn't overlap, we have nothing.. */ - if (vma->vm_start >= end) - return 0; /* * If we need to split any vma, do it now to save pain later. -- cgit From 96d990239e31d9623fdb96e829237b997c9d3d63 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:41 -0700 Subject: mm/mmap: introduce unlock_range() for code cleanup Both __do_munmap() and exit_mmap() unlock a range of VMAs using almost identical code blocks. Replace both blocks by a static inline function. [akpm@linux-foundation.org: tweak code layout] Link: https://lkml.kernel.org/r/20210510211021.2797427-1-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index bb128a42557e..d72716f7a0b4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2802,6 +2802,22 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } +static inline void +unlock_range(struct vm_area_struct *start, unsigned long limit) +{ + struct mm_struct *mm = start->vm_mm; + struct vm_area_struct *tmp = start; + + while (tmp && tmp->vm_start < limit) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + + tmp = tmp->vm_next; + } +} + /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -2885,17 +2901,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* * unlock any mlock()ed ranges before detaching vmas */ - if (mm->locked_vm) { - struct vm_area_struct *tmp = vma; - while (tmp && tmp->vm_start < end) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } - } + if (mm->locked_vm) + unlock_range(vma, end); /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) @@ -3180,14 +3187,8 @@ void exit_mmap(struct mm_struct *mm) mmap_write_unlock(mm); } - if (mm->locked_vm) { - vma = mm->mmap; - while (vma) { - if (vma->vm_flags & VM_LOCKED) - munlock_vma_pages_all(vma); - vma = vma->vm_next; - } - } + if (mm->locked_vm) + unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); -- cgit From 35e43c5ff4d2da700e8ed2216acae81f62800eaa Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:44 -0700 Subject: mm/mmap: use find_vma_intersection() in do_mmap() for overlap Using find_vma_intersection() avoids the need for a temporary variable and makes the code cleaner. Link: https://lkml.kernel.org/r/20210511014328.2902782-1-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d72716f7a0b4..d8c92ae50565 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1457,9 +1457,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return addr; if (flags & MAP_FIXED_NOREPLACE) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && vma->vm_start < addr + len) + if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } -- cgit From 2797e79f1a491fe4ffc4daf1104243ad07902d3f Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Mon, 28 Jun 2021 19:38:47 -0700 Subject: mm/memory.c: fix comment of finish_mkwrite_fault() Fix the return value in comment of finish_mkwrite_fault(). 
Link: https://lkml.kernel.org/r/20210513093931.15234-1-liu.xiang@zlingsmart.com Signed-off-by: Liu Xiang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index a4d82a6de000..b195ece205ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3049,7 +3049,7 @@ oom: * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * - * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before + * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) -- cgit From ce6d42f2e4a2d98898419743b037a95661e3ac9d Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:50 -0700 Subject: mm: add vma_lookup(), update find_vma_intersection() comments Patch series "mm: Add vma_lookup()", v2. Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). This patch (of 22): Many places in the kernel use find_vma() to get a vma and then check the start address of the vma to ensure the next vma was not returned. Other places use the find_vma_intersection() call with add, addr + 1 as the range; looking for just the vma at a specific address. The third use of find_vma() is by developers who do not know that the function starts searching at the provided address upwards for the next vma. This results in a bug that is often overlooked for a long time. Adding the new vma_lookup() function will allow for cleaner code by removing the find_vma() calls which check limits, making find_vma_intersection() calls of a single address to be shorter, and potentially reduce the incorrect uses of find_vma(). Also change find_vma_intersection() comments and declaration to be of the correct length and add kernel documentation style comment. Link: https://lkml.kernel.org/r/20210521174745.2219620-1-Liam.Howlett@Oracle.com Link: https://lkml.kernel.org/r/20210521174745.2219620-2-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: David Miller Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 79f32962d7ae..1a98b5447a3b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2676,17 +2676,45 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +static inline +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct * vma = find_vma(mm,start_addr); + struct vm_area_struct *vma = find_vma(mm, start_addr); if (vma && end_addr <= vma->vm_start) vma = NULL; return vma; } +/** + * vma_lookup() - Find a VMA at a specific address + * @mm: The process address space. + * @addr: The user address. + * + * Return: The vm_area_struct at the given address, %NULL otherwise. + */ +static inline +struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && addr < vma->vm_start) + vma = NULL; + + return vma; +} + static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long vm_start = vma->vm_start; -- cgit From 064b2663603c76e9ab6fe1bb2e92d1a7299fff9e Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:53 -0700 Subject: drm/i915/selftests: use vma_lookup() in __igt_mmap() vma_lookup() will look up the vma at a specific address. find_vma() will start the search for a specific address and continue upwards. This fixes an issue with the selftest as the returned vma may not be the newly created vma, but simply the vma at a higher address. objects Link: https://lkml.kernel.org/r/20210521174745.2219620-3-Liam.Howlett@Oracle.com Fixes: 6fedafacae1b (drm/i915/selftests: Wrap vm_mmap() around GEM Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c index 5cf6df49c333..35c15ef1327d 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c @@ -871,7 +871,7 @@ static int __igt_mmap(struct drm_i915_private *i915, pr_debug("igt_mmap(%s, %d) @ %lx\n", obj->mm.region->name, type, addr); - area = find_vma(current->mm, addr); + area = vma_lookup(current->mm, addr); if (!area) { pr_err("%s: Did not create a vm_area_struct for the mmap\n", obj->mm.region->name); -- cgit From b55541414bd00dbf64cf2ff4c4f1c41cd5cd42dc Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:56 -0700 Subject: arch/arc/kernel/troubleshoot: use vma_lookup() instead of find_vma() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-4-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/kernel/troubleshoot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index a331bb5d8319..7654c2e42dc0 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -83,12 +83,12 @@ static void show_faulting_vma(unsigned long address) * non-inclusive vma */ mmap_read_lock(active_mm); - vma = find_vma(active_mm, address); + vma = vma_lookup(active_mm, address); - /* check against the find_vma( ) behaviour which returns the next VMA - * if the container VMA is not found + /* Lookup the vma at the address and report if the container VMA is not + * found */ - if (vma && (vma->vm_start <= address)) { + if (vma) { char buf[ARC_PATH_MAX]; char *nm = "?"; -- cgit From 09eef83a801512a71b0c95c25e7d8fd69141aa1b Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:38:59 -0700 Subject: arch/arm64/kvm: use vma_lookup() instead of find_vma_intersection() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-5-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c10207fed2f3..74b3c1a3ff5a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -855,7 +855,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Let's check if we will get back a huge page backed by hugetlbfs */ mmap_read_lock(current->mm); - vma = find_vma_intersection(current->mm, hva, hva + 1); + vma = vma_lookup(current->mm, hva); if (unlikely(!vma)) { kvm_err("Failed to find VMA for hva 0x%lx\n", hva); mmap_read_unlock(current->mm); -- cgit From 27a14d287e16c308040508be9f0cb28bc935bd0e Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:02 -0700 Subject: arch/powerpc/kvm/book3s_hv_uvmem: use vma_lookup() instead of find_vma_intersection() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-6-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 84e5a2dc8be5..34720b79588f 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -614,7 +614,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, /* Fetch the VMA if addr is not in the latest fetched one */ if (!vma || addr >= vma->vm_end) { - vma = find_vma_intersection(kvm->mm, addr, addr+1); + vma = vma_lookup(kvm->mm, addr); if (!vma) { pr_err("Can't find VMA for gfn:0x%lx\n", gfn); break; -- cgit From 900c83f88af06bf0466c28bdde8a2c011b855e44 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:05 -0700 Subject: arch/powerpc/kvm/book3s: use vma_lookup() in kvmppc_hv_setup_htab_rma() Using vma_lookup() removes the requirement to check if the address is within the returned vma. The code is easier to understand and more compact. Link: https://lkml.kernel.org/r/20210521174745.2219620-7-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bc0813644666..fb83c84d116a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4758,8 +4758,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; mmap_read_lock(kvm->mm); - vma = find_vma(kvm->mm, hva); - if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) + vma = vma_lookup(kvm->mm, hva); + if (!vma || (vma->vm_flags & VM_IO)) goto up_out; psize = vma_kernel_pagesize(vma); -- cgit From 7f7020ac0dc9a7a7dfb2237c6e8ed13f40162a6b Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:08 -0700 Subject: arch/mips/kernel/traps: use vma_lookup() instead of find_vma() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-8-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/traps.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 0b4e06303c55..6f07362de5ce 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -784,7 +784,6 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr, int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) { int si_code; - struct vm_area_struct *vma; switch (sig) { case 0: @@ -800,8 +799,7 @@ int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) case SIGSEGV: mmap_read_lock(current->mm); - vma = find_vma(current->mm, (unsigned long)fault_addr); - if (vma && (vma->vm_start <= (unsigned long)fault_addr)) + if (vma_lookup(current->mm, (unsigned long)fault_addr)) si_code = SEGV_ACCERR; else si_code = SEGV_MAPERR; -- cgit From 3b93e042a59dd8c5b252eb0934f195f169ce68f9 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:11 -0700 Subject: arch/m68k/kernel/sys_m68k: use vma_lookup() in sys_cacheflush() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-9-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/kernel/sys_m68k.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index f55bdcb8e4f1..bd0274c7592e 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -402,8 +402,8 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) * to this process. 
*/ mmap_read_lock(current->mm); - vma = find_vma(current->mm, addr); - if (!vma || addr < vma->vm_start || addr + len > vma->vm_end) + vma = vma_lookup(current->mm, addr); + if (!vma || addr + len > vma->vm_end) goto out_unlock; } -- cgit From 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:14 -0700 Subject: x86/sgx: use vma_lookup() in sgx_encl_find() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 6e74f85b6264..fec43ca65065 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; -- cgit From fc98c03ba9ea970c6b346a6fe57f98c16a3971da Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:17 -0700 Subject: virt/kvm: use vma_lookup() instead of find_vma_intersection() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-11-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 46fb042837d2..732bfaf252bd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2170,7 +2170,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, } retry: - vma = find_vma_intersection(current->mm, addr, addr + 1); + vma = vma_lookup(current->mm, addr); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; -- cgit From 85715d6809014870a8a4d498b292fc5711a969e7 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:20 -0700 Subject: vfio: use vma_lookup() instead of find_vma_intersection() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-12-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a3e925a41b0d..4fce73a8a650 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -567,7 +567,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, vaddr = untagged_addr(vaddr); retry: - vma = find_vma_intersection(mm, vaddr, vaddr + 1); + vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); -- cgit From 47bdd1db16e67ebfde6f77eaf7625b2292ae6d58 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:23 -0700 Subject: net/ipv4/tcp: use vma_lookup() in tcp_zerocopy_receive() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-13-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: David Miller Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1c1f9e3de72..64bf179cc915 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2095,8 +2095,8 @@ static int tcp_zerocopy_receive(struct sock *sk, mmap_read_lock(current->mm); - vma = find_vma(current->mm, address); - if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { + vma = vma_lookup(current->mm, address); + if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(current->mm); return -EINVAL; } -- cgit From da68547d3692e89984f2c952c0931aa27b9095cd Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:26 -0700 Subject: drm/amdgpu: use vma_lookup() in amdgpu_ttm_tt_get_user_pages() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-14-Liam.Howlett@Oracle.com Signed-off-by: Liam R.
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Acked-by: Alex Deucher Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index d5cbc51c5eaa..61c4fb1b87fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -709,8 +709,8 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) } mmap_read_lock(mm); - vma = find_vma(mm, start); - if (unlikely(!vma || start < vma->vm_start)) { + vma = vma_lookup(mm, start); + if (unlikely(!vma)) { r = -EFAULT; goto out_unlock; } -- cgit From 49be780f798446ea86aa6cd687f9e51cbe569149 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:29 -0700 Subject: media: videobuf2: use vma_lookup() in get_vaddr_frames() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-15-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/common/videobuf2/frame_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/common/videobuf2/frame_vector.c b/drivers/media/common/videobuf2/frame_vector.c index 381158320a90..ce879f6f8f82 100644 --- a/drivers/media/common/videobuf2/frame_vector.c +++ b/drivers/media/common/videobuf2/frame_vector.c @@ -64,7 +64,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, do { unsigned long *nums = frame_vector_pfns(vec); - vma = find_vma_intersection(mm, start, start + 1); + vma = vma_lookup(mm, start); if (!vma) break; -- cgit From 2beaf153e1d041e0a61e3aae618294f4a037055d Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:32 -0700 Subject: misc/sgi-gru/grufault: use vma_lookup() in gru_find_vma() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-16-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-gru/grufault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index 723825524ea0..d7ef61e602ed 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -49,8 +49,8 @@ struct vm_area_struct *gru_find_vma(unsigned long vaddr) { struct vm_area_struct *vma; - vma = find_vma(current->mm, vaddr); - if (vma && vma->vm_start <= vaddr && vma->vm_ops == &gru_vm_ops) + vma = vma_lookup(current->mm, vaddr); + if (vma && vma->vm_ops == &gru_vm_ops) return vma; return NULL; } -- cgit From 9016ddeddf8510f79b4c5816855cdd244e84ad7f Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:35 -0700 Subject: kernel/events/uprobes: use vma_lookup() in find_active_uprobe() Use vma_lookup() to find the VMA at a specific address. 
As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-17-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6addc9780319..907d4ee00cb2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2046,8 +2046,8 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct vm_area_struct *vma; mmap_read_lock(mm); - vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { + vma = vma_lookup(mm, bp_vaddr); + if (vma) { if (valid_vma(vma, false)) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); -- cgit From 46e6b31d4617612e47daeb7b4b6350b116349f6d Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:38 -0700 Subject: lib/test_hmm: use vma_lookup() in dmirror_migrate() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-18-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hmm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 80a78877bd93..15f2e2db77bc 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -686,9 +686,8 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_lock(mm); for (addr = start; addr < end; addr = next) { - vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start || - !(vma->vm_flags & VM_READ)) { + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_READ)) { ret = -EINVAL; goto out; } -- cgit From ff69fb8100f18151f838c1e07368bbc98b437e6a Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:41 -0700 Subject: mm/ksm: use vma_lookup() in find_mergeable_vma() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-19-Liam.Howlett@Oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 2f3aaeb34a42..3fa9bc8a67cf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -521,10 +521,8 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, struct vm_area_struct *vma; if (ksm_test_exit(mm)) return NULL; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - return NULL; - if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) return NULL; return vma; } -- cgit From 059b8b4875b3c046770e4f9fb553ece40b217b40 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:44 -0700 Subject: mm/migrate: use vma_lookup() in do_pages_stat_array() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-20-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 41ff2c9896c4..380ca57b9031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1834,8 +1834,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, struct page *page; int err = -EFAULT; - vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start) + vma = vma_lookup(mm, addr); + if (!vma) goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ -- cgit From 5aaf07f0812adef788f9f08a73914148b5fdd40e Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:47 -0700 Subject: mm/mremap: use vma_lookup() in vma_to_resize() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-21-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 47c255b60150..a369a6100698 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -634,10 +634,11 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, unsigned long *p) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = find_vma(mm, addr); + struct vm_area_struct *vma; unsigned long pgoff; - if (!vma || vma->vm_start > addr) + vma = vma_lookup(mm, addr); + if (!vma) return ERR_PTR(-EFAULT); /* -- cgit From 3e418f9888463a80d559498a523e582b59e5ff2d Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:50 -0700 Subject: mm/memory.c: use vma_lookup() in __access_remote_vm() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. 
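The conversion pattern is the same at each call site in this series: find_vma() can return a VMA that starts above the requested address, so every caller had to re-check vm_start by hand, whereas vma_lookup() only returns a VMA that actually contains the address. A minimal sketch of the two idioms (hypothetical helpers for illustration only, not part of this patch):

	/* Old idiom: find_vma() may return the next VMA above addr. */
	static struct vm_area_struct *lookup_old(struct mm_struct *mm, unsigned long addr)
	{
		struct vm_area_struct *vma = find_vma(mm, addr);

		if (!vma || vma->vm_start > addr)
			return NULL;
		return vma;
	}

	/* New idiom: vma_lookup() returns NULL unless addr lies inside the VMA. */
	static struct vm_area_struct *lookup_new(struct mm_struct *mm, unsigned long addr)
	{
		return vma_lookup(mm, addr);
	}

Both helpers assume the caller already holds mmap_lock, as the call sites above do.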
Link: https://lkml.kernel.org/r/20210521174745.2219620-22-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b195ece205ef..3dd6b2e73e1d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4994,8 +4994,8 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. */ - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) + vma = vma_lookup(mm, addr); + if (!vma) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, -- cgit From 33e3575c5148c9874122d9a5062d58fc570f5ee6 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:53 -0700 Subject: mm/mempolicy: use vma_lookup() in __access_remote_vm() vma_lookup() finds the vma of a specific address with a cleaner interface and is more readable. Link: https://lkml.kernel.org/r/20210521174745.2219620-23-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d79fa299b70c..325771bef5e2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -975,7 +975,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); - vma = find_vma_intersection(mm, addr, addr+1); + vma = vma_lookup(mm, addr); if (!vma) { mmap_read_unlock(mm); return -EFAULT; -- cgit From 5673a60b80e8d1eaaa1e800e8a85451fd037f63e Mon Sep 17 00:00:00 2001 From: Chen Li Date: Mon, 28 Jun 2021 19:39:56 -0700 Subject: mm: update legacy flush_tlb_* to use vma 1. These tlb flush functions have been using vma instead mm long time ago, but there is still some comments use mm as parameter. 2. the actual struct we use is vm_area_struct instead of vma_struct. 3. remove unused flush_kern_tlb_page. Link: https://lkml.kernel.org/r/87k0oaq311.wl-chenli@uniontech.com Signed-off-by: Chen Li Acked-by: Geert Uytterhoeven Cc: Russell King Cc: Jonas Bonn Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlbflush.h | 13 +++---------- arch/arm/mm/tlb-v6.S | 2 +- arch/arm/mm/tlb-v7.S | 2 +- arch/ia64/kernel/efi_stub.S | 2 +- arch/m68k/include/asm/tlbflush.h | 2 +- arch/openrisc/include/asm/tlbflush.h | 2 +- arch/xtensa/include/asm/tlbflush.h | 4 ++-- 7 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 24cbfc112dfa..0ccc985b90af 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -253,7 +253,7 @@ extern struct cpu_tlb_fns cpu_tlb; * space. * - mm - mm_struct describing address space * - * flush_tlb_range(mm,start,end) + * flush_tlb_range(vma,start,end) * * Invalidate a range of TLB entries in the specified * address space. 
@@ -261,18 +261,11 @@ extern struct cpu_tlb_fns cpu_tlb; * - start - start address (may not be aligned) * - end - end address (exclusive, may not be aligned) * - * flush_tlb_page(vaddr,vma) + * flush_tlb_page(vma, uaddr) * * Invalidate the specified page in the specified address range. + * - vma - vm_area_struct describing address range * - vaddr - virtual address (may not be aligned) - * - vma - vma_struct describing address range - * - * flush_kern_tlb_page(kaddr) - * - * Invalidate the TLB entry for the specified page. The address - * will be in the kernels virtual memory space. Current uses - * only require the D-TLB to be invalidated. - * - kaddr - Kernel virtual memory address */ /* diff --git a/arch/arm/mm/tlb-v6.S b/arch/arm/mm/tlb-v6.S index 5335b9687297..74f4b383afe3 100644 --- a/arch/arm/mm/tlb-v6.S +++ b/arch/arm/mm/tlb-v6.S @@ -24,7 +24,7 @@ * * - start - start address (may not be aligned) * - end - end address (exclusive, may not be aligned) - * - vma - vma_struct describing address range + * - vma - vm_area_struct describing address range * * It is assumed that: * - the "Invalidate single entry" instruction will invalidate diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S index 1bb28d7db567..87bf4ab17721 100644 --- a/arch/arm/mm/tlb-v7.S +++ b/arch/arm/mm/tlb-v7.S @@ -23,7 +23,7 @@ * * - start - start address (may not be aligned) * - end - end address (exclusive, may not be aligned) - * - vma - vma_struct describing address range + * - vma - vm_area_struct describing address range * * It is assumed that: * - the "Invalidate single entry" instruction will invalidate diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S index 58233bb7976d..1fd61b78fb29 100644 --- a/arch/ia64/kernel/efi_stub.S +++ b/arch/ia64/kernel/efi_stub.S @@ -7,7 +7,7 @@ * * This stub allows us to make EFI calls in physical mode with interrupts * turned off. We need this because we can't call SetVirtualMap() until - * the kernel has booted far enough to allow allocation of struct vma_struct + * the kernel has booted far enough to allow allocation of struct vm_area_struct * entries (which we would need to map stuff with memory attributes other * than uncached or writeback...). 
Since the GetTime() service gets called * earlier than that, we need to be able to make physical mode EFI calls from diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index 5337bc2c262f..a6318ccd308f 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -263,7 +263,7 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr BUG(); } -static inline void flush_tlb_range(struct mm_struct *mm, +static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { BUG(); diff --git a/arch/openrisc/include/asm/tlbflush.h b/arch/openrisc/include/asm/tlbflush.h index 185dcd3731ed..dbf030365ab4 100644 --- a/arch/openrisc/include/asm/tlbflush.h +++ b/arch/openrisc/include/asm/tlbflush.h @@ -25,7 +25,7 @@ * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(mm, start, end) flushes a range of pages + * - flush_tlb_range(vma, start, end) flushes a range of pages */ extern void local_flush_tlb_all(void); extern void local_flush_tlb_mm(struct mm_struct *mm); diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h index 856e2da2e397..573df8cea200 100644 --- a/arch/xtensa/include/asm/tlbflush.h +++ b/arch/xtensa/include/asm/tlbflush.h @@ -26,8 +26,8 @@ * * - flush_tlb_all() flushes all processes TLB entries * - flush_tlb_mm(mm) flushes the specified mm context TLB entries - * - flush_tlb_page(mm, vmaddr) flushes a single page - * - flush_tlb_range(mm, start, end) flushes a range of pages + * - flush_tlb_page(vma, page) flushes a single page + * - flush_tlb_range(vma, vmaddr, end) flushes a range of pages */ void local_flush_tlb_all(void); -- cgit From f4c1ab0937c3a22c5e6e735b47fa3fa9c68dc26e Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 28 Jun 2021 19:40:02 -0700 Subject: h8300: remove unused variable Kernel test robot throws below warning -> >> arch/h8300/kernel/setup.c:72:26: warning: Unused variable: region [unusedVariable] struct memblock_region *region; Fixed it by removing unused variable. Link: https://lkml.kernel.org/r/20210602185431.11416-1-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reported-by: kernel test robot Acked-by: Mike Rapoport Acked-by: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/h8300/kernel/setup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index 0281f92eea3d..c3590b2e9592 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -69,8 +69,6 @@ void __init h8300_fdt_init(void *fdt, char *bootargs) static void __init bootmem_init(void) { - struct memblock_region *region; - memory_end = memory_start = 0; /* Find main memory where is the kernel */ -- cgit From e8df2c703d5d1a99cfc45124bfa6f5e1982e0166 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 28 Jun 2021 19:40:05 -0700 Subject: mm/dmapool: use DEVICE_ATTR_RO macro Use DEVICE_ATTR_RO() helper instead of plain DEVICE_ATTR(), which makes the code a bit shorter and easier to read. 
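For reference (a rough sketch of the helper from <linux/device.h>, not part of this patch): DEVICE_ATTR_RO(name) declares a read-only attribute whose show callback must be called name_show(), which is why the patch below also renames show_pools() to pools_show():

        #define DEVICE_ATTR_RO(_name) \
                struct device_attribute dev_attr_##_name = __ATTR_RO(_name)

        /*
         * __ATTR_RO(_name) sets .attr.mode = 0444 and .show = _name##_show,
         * so DEVICE_ATTR_RO(pools) ends up equivalent to the old
         * DEVICE_ATTR(pools, 0444, pools_show, NULL).
         */
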
Link: https://lkml.kernel.org/r/20210524112852.34716-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reviewed-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 16483f86360e..64b537b3ccb0 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -62,8 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ static DEFINE_MUTEX(pools_lock); static DEFINE_MUTEX(pools_reg_lock); -static ssize_t -show_pools(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf) { unsigned temp; unsigned size; @@ -103,7 +102,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) return PAGE_SIZE - size; } -static DEVICE_ATTR(pools, 0444, show_pools, NULL); +static DEVICE_ATTR_RO(pools); /** * dma_pool_create - Creates a pool of consistent memory blocks, for dma. -- cgit From 53d884a6675b0fd7bc8c7b4afd6ead6f17bc4c61 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 28 Jun 2021 19:40:08 -0700 Subject: mm, tracing: unify PFN format strings Some trace event formats print PFNs as hex while others print them as decimal. This is rather annoying when attempting to grep through traces to understand what's going on with a particular page. $ git grep -ho 'pfn=[0x%lu]\+' include/trace/events/ | sort | uniq -c 11 pfn=0x%lx 12 pfn=%lu 2 pfn=%lx Printing as hex is in the majority in the trace events, and all the normal printks in mm/ also print PFNs as hex, so change all the PFN formats in the trace events to use 0x%lx. Link: https://lkml.kernel.org/r/20210602092608.1493-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Cc: Steven Rostedt Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Ilias Apalodimas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 4 ++-- include/trace/events/filemap.h | 2 +- include/trace/events/kmem.h | 12 ++++++------ include/trace/events/page_pool.h | 4 ++-- include/trace/events/pagemap.h | 4 ++-- include/trace/events/vmscan.h | 2 +- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index c3d354702cb0..3d708dae1542 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", + TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p count=%lu", + TP_printk("name=%s pfn=0x%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h index 796053e162d2..c47b63db124e 100644 --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -36,7 +36,7 @@ DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, __entry->s_dev = page->mapping->host->i_rdev; ), - TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu", + TP_printk("dev %d:%d ino %lx page=%p pfn=0x%lx ofs=%lu", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, pfn_to_page(__entry->pfn), diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 829a75692cc0..ddc8c944f417 100644 --- a/include/trace/events/kmem.h +++ 
b/include/trace/events/kmem.h @@ -173,7 +173,7 @@ TRACE_EVENT(mm_page_free, __entry->order = order; ), - TP_printk("page=%p pfn=%lu order=%d", + TP_printk("page=%p pfn=0x%lx order=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order) @@ -193,7 +193,7 @@ TRACE_EVENT(mm_page_free_batched, __entry->pfn = page_to_pfn(page); ), - TP_printk("page=%p pfn=%lu order=0", + TP_printk("page=%p pfn=0x%lx order=0", pfn_to_page(__entry->pfn), __entry->pfn) ); @@ -219,7 +219,7 @@ TRACE_EVENT(mm_page_alloc, __entry->migratetype = migratetype; ), - TP_printk("page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s", + TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, @@ -245,7 +245,7 @@ DECLARE_EVENT_CLASS(mm_page, __entry->migratetype = migratetype; ), - TP_printk("page=%p pfn=%lu order=%u migratetype=%d percpu_refill=%d", + TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d", __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, __entry->pfn != -1UL ? __entry->pfn : 0, __entry->order, @@ -278,7 +278,7 @@ TRACE_EVENT(mm_page_pcpu_drain, __entry->migratetype = migratetype; ), - TP_printk("page=%p pfn=%lu order=%d migratetype=%d", + TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->order, __entry->migratetype) ); @@ -312,7 +312,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, get_pageblock_migratetype(page)); ), - TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", + TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", pfn_to_page(__entry->pfn), __entry->pfn, __entry->alloc_order, diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h index ad0aa7f31675..ca534501158b 100644 --- a/include/trace/events/page_pool.h +++ b/include/trace/events/page_pool.h @@ -60,7 +60,7 @@ TRACE_EVENT(page_pool_state_release, __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p pfn=%lu release=%u", + TP_printk("page_pool=%p page=%p pfn=0x%lx release=%u", __entry->pool, __entry->page, __entry->pfn, __entry->release) ); @@ -85,7 +85,7 @@ TRACE_EVENT(page_pool_state_hold, __entry->pfn = page_to_pfn(page); ), - TP_printk("page_pool=%p page=%p pfn=%lu hold=%u", + TP_printk("page_pool=%p page=%p pfn=0x%lx hold=%u", __entry->pool, __entry->page, __entry->pfn, __entry->hold) ); diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index e1735fe7c76a..1d28431e85bd 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -46,7 +46,7 @@ TRACE_EVENT(mm_lru_insertion, ), /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=%lu lru=%d flags=%s%s%s%s%s%s", + TP_printk("page=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->page, __entry->pfn, __entry->lru, @@ -75,7 +75,7 @@ TRACE_EVENT(mm_lru_activate, ), /* Flag format is based on page-types.c formatting for pagemap */ - TP_printk("page=%p pfn=%lu", __entry->page, __entry->pfn) + TP_printk("page=%p pfn=0x%lx", __entry->page, __entry->pfn) ); diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 2070df64958e..00d1180527d8 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -330,7 +330,7 @@ 
TRACE_EVENT(mm_vmscan_writepage, page_is_file_lru(page)); ), - TP_printk("page=%p pfn=%lu flags=%s", + TP_printk("page=%p pfn=0x%lx flags=%s", pfn_to_page(__entry->pfn), __entry->pfn, show_reclaim_flags(__entry->reclaim_flags)) -- cgit From a2afc59fb25027749bd41c44f47382522232019e Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 28 Jun 2021 19:40:11 -0700 Subject: mm/page_alloc: add an alloc_pages_bulk_array_node() helper Patch series "vmalloc() vs bulk allocator", v2. This patch (of 3): Add a "node" variant of the alloc_pages_bulk_array() function. The helper guarantees that a __alloc_pages_bulk() is invoked with a valid NUMA node ID. Link: https://lkml.kernel.org/r/20210516202056.2120-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20210516202056.2120-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Mel Gorman Cc: Mel Gorman Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 11da8af06704..94f0b8b1cb55 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -536,6 +536,15 @@ alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_arr return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); } +static inline unsigned long +alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) +{ + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + + return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); +} + /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). -- cgit From 5c1f4e690eecc795b2e4d4408e87302040fceca4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 28 Jun 2021 19:40:14 -0700 Subject: mm/vmalloc: switch to bulk allocator in __vmalloc_area_node() Recently there has been introduced a page bulk allocator for users which need to get number of pages per one call request. For order-0 pages switch to an alloc_pages_bulk_array_node() instead of alloc_pages_node(), the reason is the former is not capable of allocating set of pages, thus a one call is per one page. Second, according to my tests the bulk allocator uses less cycles even for scenarios when only one page is requested. 
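For reference, the bulk interface fills a caller-supplied page array and returns how many pages it actually managed to allocate, so the order-0 path boils down to a single call (a sketch of the shape introduced below; partial success is checked by the caller afterwards):

        /* order-0: try to grab all nr_small_pages in one call */
        area->nr_pages = alloc_pages_bulk_array_node(gfp_mask, node,
                                                     nr_small_pages, area->pages);
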
Running the "perf" on same test case shows below difference: - 45.18% __vmalloc_node - __vmalloc_node_range - 35.60% __alloc_pages - get_page_from_freelist 3.36% __list_del_entry_valid 3.00% check_preemption_disabled 1.42% prep_new_page - 31.00% __vmalloc_node - __vmalloc_node_range - 14.48% __alloc_pages_bulk 3.22% __list_del_entry_valid - 0.83% __alloc_pages get_page_from_freelist The "test_vmalloc.sh" also shows performance improvements: fix_size_alloc_test_4MB loops: 1000000 avg: 89105095 usec fix_size_alloc_test loops: 1000000 avg: 513672 usec full_fit_alloc_test loops: 1000000 avg: 748900 usec long_busy_list_alloc_test loops: 1000000 avg: 8043038 usec random_size_alloc_test loops: 1000000 avg: 4028582 usec fix_align_alloc_test loops: 1000000 avg: 1457671 usec fix_size_alloc_test_4MB loops: 1000000 avg: 62083711 usec fix_size_alloc_test loops: 1000000 avg: 449207 usec full_fit_alloc_test loops: 1000000 avg: 735985 usec long_busy_list_alloc_test loops: 1000000 avg: 5176052 usec random_size_alloc_test loops: 1000000 avg: 2589252 usec fix_align_alloc_test loops: 1000000 avg: 1365009 usec For example 4MB allocations illustrates ~30% gain, all the rest is also better. Link: https://lkml.kernel.org/r/20210516202056.2120-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Mel Gorman Cc: Hillf Danton Cc: Matthew Wilcox Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 76 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d0a7d89be091..e630f2cf7900 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2768,8 +2768,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; - struct page **pages; - unsigned int i; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2778,13 +2776,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { - pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node(array_size, nested_gfp, node); } - if (!pages) { + if (!area->pages) { free_vm_area(area); warn_alloc(gfp_mask, NULL, "vmalloc size %lu allocation failure: " @@ -2793,43 +2791,53 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return NULL; } - area->pages = pages; - area->nr_pages = nr_small_pages; + area->nr_pages = 0; set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - page_order = vm_area_page_order(area); - /* - * Careful, we allocate and map page_order pages, but tracking is done - * per PAGE_SIZE page so as to keep the vm_struct APIs independent of - * the physical/mapped size. 
- */ - for (i = 0; i < area->nr_pages; i += 1U << page_order) { - struct page *page; - int p; - - /* Compound pages required for remap_vmalloc_page */ - page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); - if (unlikely(!page)) { - /* Successfully allocated i pages, free them in __vfree() */ - area->nr_pages = i; - atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "page order %u allocation failed", - area->nr_pages * PAGE_SIZE, page_order); - goto fail; - } + if (!page_order) { + area->nr_pages = alloc_pages_bulk_array_node( + gfp_mask, node, nr_small_pages, area->pages); + } else { + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + while (area->nr_pages < nr_small_pages) { + struct page *page; + int i; + + /* Compound pages required for remap_vmalloc_page */ + page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); + if (unlikely(!page)) + break; - for (p = 0; p < (1U << page_order); p++) - area->pages[i + p] = page + p; + for (i = 0; i < (1U << page_order); i++) + area->pages[area->nr_pages + i] = page + i; - if (gfpflags_allow_blocking(gfp_mask)) - cond_resched(); + if (gfpflags_allow_blocking(gfp_mask)) + cond_resched(); + + area->nr_pages += 1U << page_order; + } } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + /* + * If not enough pages were obtained to accomplish an + * allocation request, free them via __vfree() if any. + */ + if (area->nr_pages != nr_small_pages) { + warn_alloc(gfp_mask, NULL, + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", + area->nr_pages * PAGE_SIZE, page_order); + goto fail; + } + + if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { warn_alloc(gfp_mask, NULL, "vmalloc size %lu allocation failure: " "failed to map pages", -- cgit From cd61413baa1052fc13e75dd092a0e23ac29a0205 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 28 Jun 2021 19:40:17 -0700 Subject: mm/vmalloc: print a warning message first on failure When a memory allocation for array of pages are not succeed emit a warning message as a first step and then perform the further cleanup. The reason it should be done in a right order is the clean up function which is free_vm_area() can potentially also follow its error paths what can lead to confusion what was broken first. 
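In other words, report the failure while the failing context is still intact and only then tear it down; after the patch the error path reads (as in the diff below):

        if (!area->pages) {
                warn_alloc(gfp_mask, NULL,
                           "vmalloc size %lu allocation failure: "
                           "page array size %lu allocation failed",
                           nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
                return NULL;
        }
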
Link: https://lkml.kernel.org/r/20210516202056.2120-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Hillf Danton Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e630f2cf7900..74a31abaa52c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2783,11 +2783,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - free_vm_area(area); warn_alloc(gfp_mask, NULL, "vmalloc size %lu allocation failure: " "page array size %lu allocation failed", nr_small_pages * PAGE_SIZE, array_size); + free_vm_area(area); return NULL; } -- cgit From f4bdfeaf18a44b4d0bca945ace272cbf5e91a1b3 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 28 Jun 2021 19:40:20 -0700 Subject: mm/vmalloc: remove quoted strings split across lines A checkpatch.pl script complains on splitting a text across lines. It is because if a user wants to find an entire string he or she will not succeeded. WARNING: quoted string split across lines + "vmalloc size %lu allocation failure: " + "page order %u allocation failed", total: 0 errors, 1 warnings, 10 lines checked Link: https://lkml.kernel.org/r/20210521204359.19943-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74a31abaa52c..ed0a32ea09ee 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2784,9 +2784,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (!area->pages) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "page array size %lu allocation failed", - nr_small_pages * PAGE_SIZE, array_size); + "vmalloc error: size %lu, failed to allocated page array size %lu", + nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); return NULL; } @@ -2831,17 +2830,15 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "page order %u allocation failed", + "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; } if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "failed to map pages", - area->nr_pages * PAGE_SIZE); + "vmalloc error: size %lu, failed to map pages", + area->nr_pages * PAGE_SIZE); goto fail; } @@ -2886,8 +2883,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if ((size >> PAGE_SHIFT) > totalram_pages()) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "exceeds total pages", real_size); + "vmalloc error: size %lu, exceeds total pages", + real_size); return NULL; } @@ -2918,8 +2915,8 @@ again: gfp_mask, caller); if (!area) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "vm_struct allocation failed", real_size); + "vmalloc error: size %lu, vm_struct allocation failed", + real_size); goto fail; } -- 
cgit From 12b9f873a5d0e6b3846835ec973bbafa338d0b5a Mon Sep 17 00:00:00 2001 From: Uladzislau Rezki Date: Mon, 28 Jun 2021 19:40:23 -0700 Subject: mm/vmalloc: fallback to a single page allocator Currently for order-0 pages we use a bulk-page allocator to get set of pages. From the other hand not allocating all pages is something that might occur. In that case we should fallbak to the single-page allocator trying to get missing pages, because it is more permissive(direct reclaim, etc). Introduce a vm_area_alloc_pages() function where the described logic is implemented. Link: https://lkml.kernel.org/r/20210521130718.GA17882@pc638.lan Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Mel Gorman Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 81 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ed0a32ea09ee..0c80caaf041a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2758,6 +2758,54 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ +static inline unsigned int +vm_area_alloc_pages(gfp_t gfp, int nid, + unsigned int order, unsigned long nr_pages, struct page **pages) +{ + unsigned int nr_allocated = 0; + + /* + * For order-0 pages we make use of bulk allocator, if + * the page array is partly or not at all populated due + * to fails, fallback to a single page allocator that is + * more permissive. + */ + if (!order) + nr_allocated = alloc_pages_bulk_array_node( + gfp, nid, nr_pages, pages); + else + /* + * Compound pages required for remap_vmalloc_page if + * high-order pages. + */ + gfp |= __GFP_COMP; + + /* High-order pages or fallback path if "bulk" fails. */ + while (nr_allocated < nr_pages) { + struct page *page; + int i; + + page = alloc_pages_node(nid, gfp, order); + if (unlikely(!page)) + break; + + /* + * Careful, we allocate and map page-order pages, but + * tracking is done per PAGE_SIZE page so as to keep the + * vm_struct APIs independent of the physical/mapped size. + */ + for (i = 0; i < (1U << order); i++) + pages[nr_allocated + i] = page + i; + + if (gfpflags_allow_blocking(gfp)) + cond_resched(); + + nr_allocated += 1U << order; + } + + return nr_allocated; +} + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -2790,37 +2838,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return NULL; } - area->nr_pages = 0; set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - if (!page_order) { - area->nr_pages = alloc_pages_bulk_array_node( - gfp_mask, node, nr_small_pages, area->pages); - } else { - /* - * Careful, we allocate and map page_order pages, but tracking is done - * per PAGE_SIZE page so as to keep the vm_struct APIs independent of - * the physical/mapped size. 
- */ - while (area->nr_pages < nr_small_pages) { - struct page *page; - int i; - - /* Compound pages required for remap_vmalloc_page */ - page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); - if (unlikely(!page)) - break; - - for (i = 0; i < (1U << page_order); i++) - area->pages[area->nr_pages + i] = page + i; - - if (gfpflags_allow_blocking(gfp_mask)) - cond_resched(); - - area->nr_pages += 1U << page_order; - } - } + area->nr_pages = vm_area_alloc_pages(gfp_mask, node, + page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -2835,7 +2857,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { + if (vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift) < 0) { warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); -- cgit From a850e932df657c11f2030920dbda5f5621cef091 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Mon, 28 Jun 2021 19:40:27 -0700 Subject: mm: vmalloc: add cond_resched() in __vunmap() On non-preemptible kernel builds the watchdog can complain about soft lockups when vfree() is called against large vmalloc areas: [ 210.851798] kvmalloc-test: vmalloc(2199023255552) succeeded [ 238.654842] watchdog: BUG: soft lockup - CPU#181 stuck for 26s! [rmmod:5203] [ 238.662716] Modules linked in: kvmalloc_test(OE-) ... [ 238.772671] CPU: 181 PID: 5203 Comm: rmmod Tainted: G S OE 5.13.0-rc7+ #1 [ 238.781413] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYXCRB1.86B.0553.D01.1809190614 09/19/2018 [ 238.792383] RIP: 0010:free_unref_page+0x52/0x60 [ 238.797447] Code: 48 c1 fd 06 48 89 ee e8 9c d0 ff ff 84 c0 74 19 9c 41 5c fa 48 89 ee 48 89 df e8 b9 ea ff ff 41 f7 c4 00 02 00 00 74 01 fb 5b <5d> 41 5c c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 f0 29 77 [ 238.818406] RSP: 0018:ffffb4d87868fe98 EFLAGS: 00000206 [ 238.824236] RAX: 0000000000000000 RBX: 000000001da0c945 RCX: ffffb4d87868fe40 [ 238.832200] RDX: ffffd79d3beed108 RSI: ffffd7998501dc08 RDI: ffff9c6fbffd7010 [ 238.840166] RBP: 000000000d518cbd R08: ffffd7998501dc08 R09: 0000000000000001 [ 238.848131] R10: 0000000000000000 R11: ffffd79d3beee088 R12: 0000000000000202 [ 238.856095] R13: ffff9e5be3eceec0 R14: 0000000000000000 R15: 0000000000000000 [ 238.864059] FS: 00007fe082c2d740(0000) GS:ffff9f4c69b40000(0000) knlGS:0000000000000000 [ 238.873089] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 238.879503] CR2: 000055a000611128 CR3: 000000f6094f6006 CR4: 00000000007706e0 [ 238.887467] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 238.895433] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 238.903397] PKRU: 55555554 [ 238.906417] Call Trace: [ 238.909149] __vunmap+0x17c/0x220 [ 238.912851] __x64_sys_delete_module+0x13a/0x250 [ 238.918008] ? syscall_trace_enter.isra.20+0x13c/0x1b0 [ 238.923746] do_syscall_64+0x39/0x80 [ 238.927740] entry_SYSCALL_64_after_hwframe+0x44/0xae Like in other range zapping routines that iterate over a large list, lets just add cond_resched() within __vunmap()'s page-releasing loop in order to avoid the watchdog splats. 
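The loop in question releases one (possibly compound) page per iteration, so freeing a terabyte-sized area takes a very long time with no scheduling point; with the change it looks roughly like this (the cond_resched() line is what the diff below adds):

        for (i = 0; i < area->nr_pages; i += 1U << page_order) {
                struct page *page = area->pages[i];

                BUG_ON(!page);
                __free_pages(page, page_order);
                cond_resched();
        }
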
Link: https://lkml.kernel.org/r/20210622225030.478384-1-aquini@redhat.com Signed-off-by: Rafael Aquini Acked-by: Nicholas Piggin Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Aaron Tomlin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0c80caaf041a..b2ec7f751bd0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2567,6 +2567,7 @@ static void __vunmap(const void *addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, page_order); + cond_resched(); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); -- cgit From 4469c0f17ec63dcc8c9ed512f4330b566c2c0d34 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 28 Jun 2021 19:40:30 -0700 Subject: printk: introduce dump_stack_lvl() dump_stack() is used for many different cases, which may require a log level consistent with other kernel messages surrounding the dump_stack() call. Without that, certain systems that are configured to ignore the default level messages will miss stack traces in critical error reports. This patch introduces dump_stack_lvl() that behaves similarly to dump_stack(), but accepts a custom log level. The old dump_stack() becomes equal to dump_stack_lvl(KERN_DEFAULT). A somewhat similar patch has been proposed in 2012: https://lore.kernel.org/lkml/1332493269.2359.9.camel@hebo/ , but wasn't merged. [elver@google.com: add missing dump_stack_lvl() stub if CONFIG_PRINTK=n] Link: https://lkml.kernel.org/r/YJ0KAM0hQev1AmWe@elver.google.com Link: https://lkml.kernel.org/r/20210506105405.3535023-1-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Petr Mladek Cc: Ingo Molnar Cc: he, bo Cc: Yanmin Zhang Cc: Prasad Sodagudi Cc: Dmitry Vyukov Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 5 +++++ lib/dump_stack.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index fe7eb2351610..f589b8b60806 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -206,6 +206,7 @@ void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); +extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; extern void printk_safe_flush(void); extern void printk_safe_flush_on_panic(void); @@ -269,6 +270,10 @@ static inline void show_regs_print_info(const char *log_lvl) { } +static inline void dump_stack_lvl(const char *log_lvl) +{ +} + static inline void dump_stack(void) { } diff --git a/lib/dump_stack.c b/lib/dump_stack.c index f5a33b6f773f..586e3f2c6a15 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -73,10 +73,10 @@ void show_regs_print_info(const char *log_lvl) dump_stack_print_info(log_lvl); } -static void __dump_stack(void) +static void __dump_stack(const char *log_lvl) { - dump_stack_print_info(KERN_DEFAULT); - show_stack(NULL, NULL, KERN_DEFAULT); + dump_stack_print_info(log_lvl); + show_stack(NULL, NULL, log_lvl); } /** @@ -87,7 +87,7 @@ static void __dump_stack(void) #ifdef CONFIG_SMP static atomic_t dump_lock = ATOMIC_INIT(-1); -asmlinkage __visible void dump_stack(void) +asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { unsigned long flags; int 
was_locked; @@ -117,7 +117,7 @@ retry: goto retry; } - __dump_stack(); + __dump_stack(log_lvl); if (!was_locked) atomic_set(&dump_lock, -1); @@ -125,9 +125,15 @@ retry: local_irq_restore(flags); } #else -asmlinkage __visible void dump_stack(void) +asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { - __dump_stack(); + __dump_stack(log_lvl); } #endif +EXPORT_SYMBOL(dump_stack_lvl); + +asmlinkage __visible void dump_stack(void) +{ + dump_stack_lvl(KERN_DEFAULT); +} EXPORT_SYMBOL(dump_stack); -- cgit From 336abff6e8723c6b98e141372956e6c0c55e8ea4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 28 Jun 2021 19:40:33 -0700 Subject: kasan: use dump_stack_lvl(KERN_ERR) to print stacks Most of the contents of KASAN reports are printed with pr_err(), so use a consistent logging level to print the memory access stacks. Link: https://lkml.kernel.org/r/20210506105405.3535023-2-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Prasad Sodagudi Cc: Dmitry Vyukov Cc: he, bo Cc: Ingo Molnar Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Yanmin Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 14bd51ea2348..8fff1825b22c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -230,7 +230,7 @@ static void print_address_description(void *addr, u8 tag) { struct page *page = kasan_addr_to_page(addr); - dump_stack(); + dump_stack_lvl(KERN_ERR); pr_err("\n"); if (page && PageSlab(page)) { @@ -375,7 +375,7 @@ void kasan_report_async(void) pr_err("BUG: KASAN: invalid-access\n"); pr_err("Asynchronous mode enabled: no access details available\n"); pr_err("\n"); - dump_stack(); + dump_stack_lvl(KERN_ERR); end_report(&flags, 0); } #endif /* CONFIG_KASAN_HW_TAGS */ @@ -420,7 +420,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, pr_err("\n"); print_memory_metadata(info.first_bad_addr); } else { - dump_stack(); + dump_stack_lvl(KERN_ERR); } end_report(&flags, addr); -- cgit From 3ff16d30f593d80a958104ee06a94562a12c5879 Mon Sep 17 00:00:00 2001 From: David Gow Date: Mon, 28 Jun 2021 19:40:36 -0700 Subject: kasan: test: improve failure message in KUNIT_EXPECT_KASAN_FAIL() The KUNIT_EXPECT_KASAN_FAIL() macro currently uses KUNIT_EXPECT_EQ() to compare fail_data.report_expected and fail_data.report_found. This always gave a somewhat useless error message on failure, but the addition of extra compile-time checking with READ_ONCE() has caused it to get much longer, and be truncated before anything useful is displayed. Instead, just check fail_data.report_found by hand (we've just set report_expected to 'true'), and print a better failure message with KUNIT_FAIL(). Because of this, report_expected is no longer used anywhere, and can be removed. 
Beforehand, a failure in: KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); would have looked like: [22:00:34] [FAILED] vmalloc_oob [22:00:34] # vmalloc_oob: EXPECTATION FAILED at lib/test_kasan.c:991 [22:00:34] Expected ({ do { extern void __compiletime_assert_705(void) __attribute__((__error__("Unsupported access size for {READ,WRITE}_ONCE()."))); if (!((sizeof(fail_data.report_expected) == sizeof(char) || sizeof(fail_data.repp [22:00:34] not ok 45 - vmalloc_oob With this change, it instead looks like: [22:04:04] [FAILED] vmalloc_oob [22:04:04] # vmalloc_oob: EXPECTATION FAILED at lib/test_kasan.c:993 [22:04:04] KASAN failure expected in "((volatile char *)area)[3100]", but none occurred [22:04:04] not ok 45 - vmalloc_oob Also update the example failure in the documentation to reflect this. Link: https://lkml.kernel.org/r/20210606005531.165954-1-davidgow@google.com Signed-off-by: David Gow Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Acked-by: Brendan Higgins Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Daniel Axtens Cc: David Gow Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 9 ++++----- include/linux/kasan.h | 1 - lib/test_kasan.c | 11 +++++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index d3f335ffc751..83ec4a556c19 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -447,11 +447,10 @@ When a test fails due to a failed ``kmalloc``:: When a test fails due to a missing KASAN report:: - # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:629 - Expected kasan_data->report_expected == kasan_data->report_found, but - kasan_data->report_expected == 1 - kasan_data->report_found == 0 - not ok 28 - kmalloc_double_kzfree + # kmalloc_double_kzfree: EXPECTATION FAILED at lib/test_kasan.c:974 + KASAN failure expected in "kfree_sensitive(ptr)", but none occurred + not ok 44 - kmalloc_double_kzfree + At the end the cumulative status of all KASAN tests is printed. 
On success:: diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b1678a61e6a7..18cd5ec2f469 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -17,7 +17,6 @@ struct task_struct; /* kasan_data struct is used in KUnit tests for KASAN expected failures */ struct kunit_kasan_expectation { - bool report_expected; bool report_found; }; diff --git a/lib/test_kasan.c b/lib/test_kasan.c index cacbbbdef768..44e08f4d9c52 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -55,7 +55,6 @@ static int kasan_test_init(struct kunit *test) multishot = kasan_save_enable_multi_shot(); kasan_set_tagging_report_once(false); fail_data.report_found = false; - fail_data.report_expected = false; kunit_add_named_resource(test, NULL, NULL, &resource, "kasan_data", &fail_data); return 0; @@ -94,20 +93,20 @@ static void kasan_test_exit(struct kunit *test) !kasan_async_mode_enabled()) \ migrate_disable(); \ KUNIT_EXPECT_FALSE(test, READ_ONCE(fail_data.report_found)); \ - WRITE_ONCE(fail_data.report_expected, true); \ barrier(); \ expression; \ barrier(); \ - KUNIT_EXPECT_EQ(test, \ - READ_ONCE(fail_data.report_expected), \ - READ_ONCE(fail_data.report_found)); \ + if (!READ_ONCE(fail_data.report_found)) { \ + KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ + "expected in \"" #expression \ + "\", but none occurred"); \ + } \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ if (READ_ONCE(fail_data.report_found)) \ kasan_enable_tagging_sync(); \ migrate_enable(); \ } \ WRITE_ONCE(fail_data.report_found, false); \ - WRITE_ONCE(fail_data.report_expected, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ -- cgit From 158f25522ca8cc87f512a03ed5e2a5923bd37eb3 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Mon, 28 Jun 2021 19:40:39 -0700 Subject: kasan: allow an architecture to disable inline instrumentation Patch series "KASAN core changes for ppc64 radix KASAN", v16. Building on the work of Christophe, Aneesh and Balbir, I've ported KASAN to 64-bit Book3S kernels running on the Radix MMU. I've been trying this for a while, but we keep having collisions between the kasan code in the mm tree and the code I want to put in to the ppc tree. This series just contains the kasan core changes that we need. There should be no noticeable changes to other platforms. This patch (of 4): For annoying architectural reasons, it's very difficult to support inline instrumentation on powerpc64.* Add a Kconfig flag to allow an arch to disable inline. (It's a bit annoying to be 'backwards', but I'm not aware of any way to have an arch force a symbol to be 'n', rather than 'y'.) We also disable stack instrumentation in this case as it does things that are functionally equivalent to inline instrumentation, namely adding code that touches the shadow directly without going through a C helper. * on ppc64 atm, the shadow lives in virtual memory and isn't accessible in real mode. However, before we turn on virtual memory, we parse the device tree to determine which platform and MMU we're running under. That calls generic DT code, which is instrumented. Inline instrumentation in DT would unconditionally attempt to touch the shadow region, which we won't have set up yet, and would crash. We can make outline mode wait for the arch to be ready, but we can't change what the compiler inserts for inline mode. 
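As a rough illustration of the difference (a conceptual sketch, not taken from the patch): outline instrumentation emits a call into the KASAN runtime for every access, so the runtime can bail out before touching the shadow, whereas inline instrumentation open-codes the shadow check at the access site and leaves no hook to bail out from:

        /* outline: the compiler emits a runtime call such as */
        __asan_load8(addr);     /* the runtime may return early if not ready */

        /* inline: the compiler open-codes something conceptually like */
        s8 shadow = *(s8 *)kasan_mem_to_shadow((void *)addr);   /* unconditional shadow access */
        if (unlikely(shadow))
                kasan_report(addr, 8, false, _RET_IP_);
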
Link: https://lkml.kernel.org/r/20210624034050.511391-1-dja@axtens.net Link: https://lkml.kernel.org/r/20210624034050.511391-2-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Christophe Leroy Cc: Aneesh Kumar K.V Cc: Balbir Singh Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index cffc2ebbf185..c3b228828a80 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -12,6 +12,13 @@ config HAVE_ARCH_KASAN_HW_TAGS config HAVE_ARCH_KASAN_VMALLOC bool +config ARCH_DISABLE_KASAN_INLINE + bool + help + An architecture might not support inline instrumentation. + When this option is selected, inline and stack instrumentation are + disabled. + config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) @@ -130,6 +137,7 @@ config KASAN_OUTLINE config KASAN_INLINE bool "Inline instrumentation" + depends on !ARCH_DISABLE_KASAN_INLINE help Compiler directly inserts code checking shadow memory before memory accesses. This is faster than outline (in some workloads @@ -141,6 +149,7 @@ endchoice config KASAN_STACK bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST depends on KASAN_GENERIC || KASAN_SW_TAGS + depends on !ARCH_DISABLE_KASAN_INLINE default y if CC_IS_GCC help The LLVM stack address sanitizer has a know problem that @@ -154,6 +163,9 @@ config KASAN_STACK but clang users can still enable it for builds without CONFIG_COMPILE_TEST. On gcc it is assumed to always be safe to use and enabled by default. + If the architecture disables inline instrumentation, stack + instrumentation is also disabled as it adds inline-style + instrumentation that is run unconditionally. config KASAN_SW_TAGS_IDENTIFY bool "Enable memory corruption identification" -- cgit From af3751f3c2b6282bebcb56c35bbe4c8b671f80aa Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Mon, 28 Jun 2021 19:40:42 -0700 Subject: kasan: allow architectures to provide an outline readiness check Allow architectures to define a kasan_arch_is_ready() hook that bails out of any function that's about to touch the shadow unless the arch says that it is ready for the memory to be accessed. This is fairly uninvasive and should have a negligible performance penalty. This will only work in outline mode, so an arch must specify ARCH_DISABLE_KASAN_INLINE if it requires this. 
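An architecture opting in would provide something along these lines in its own headers (a hypothetical sketch: only the hook name and the #define convention come from this patch, the static key is made up):

        /* e.g. in arch/xxx/include/asm/kasan.h */
        static inline bool kasan_arch_is_ready(void)
        {
                return static_branch_likely(&arch_kasan_ready_key);     /* hypothetical key */
        }
        #define kasan_arch_is_ready kasan_arch_is_ready

The trailing #define is what tells the generic fallback in mm/kasan/kasan.h (see the diff below) that the architecture supplied its own version.
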
Link: https://lkml.kernel.org/r/20210624034050.511391-3-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Marco Elver Suggested-by: Christophe Leroy Reviewed-by: Andrey Konovalov Cc: Balbir Singh Cc: Aneesh Kumar K.V Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 3 +++ mm/kasan/generic.c | 3 +++ mm/kasan/kasan.h | 6 ++++++ mm/kasan/shadow.c | 6 ++++++ 4 files changed, 18 insertions(+) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2586d3718600..267500896b1e 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -331,6 +331,9 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, u8 tag; void *tagged_object; + if (!kasan_arch_is_ready()) + return false; + tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 53cbf28859b5..c3f5ba7a294a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -163,6 +163,9 @@ static __always_inline bool check_region_inline(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { + if (!kasan_arch_is_ready()) + return true; + if (unlikely(size == 0)) return true; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8f450bc28045..4dbc8def64f4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -449,6 +449,12 @@ static inline void kasan_poison_last_granule(const void *address, size_t size) { #endif /* CONFIG_KASAN_GENERIC */ +#ifndef kasan_arch_is_ready +static inline bool kasan_arch_is_ready(void) { return true; } +#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE) +#error kasan_arch_is_ready only works in KASAN generic outline mode! +#endif + /* * Exported functions for interfaces called from assembly or from generated * code. Declarations here to avoid warning about missing declarations. diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 082ee5b6d9a1..8d95ee52d019 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -73,6 +73,9 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; + if (!kasan_arch_is_ready()) + return; + /* * Perform shadow offset calculation based on untagged address, as * some of the callers (e.g. kasan_poison_object_data) pass tagged @@ -99,6 +102,9 @@ EXPORT_SYMBOL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC void kasan_poison_last_granule(const void *addr, size_t size) { + if (!kasan_arch_is_ready()) + return; + if (size & KASAN_GRANULE_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size); *shadow = size & KASAN_GRANULE_MASK; -- cgit From c0f8aa4fa815daacb6eca52cae04820d6aecb7c2 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Mon, 28 Jun 2021 19:40:46 -0700 Subject: mm: define default MAX_PTRS_PER_* in include/pgtable.h Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable") made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a constant for cases which need a compile-time constant (e.g. fixed-size arrays). powerpc likewise has boot-time selectable MMU features which can cause other mm "constants" to vary. For KASAN, we have some static PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless an architecture has defined MAX_PTRS_PER_x in its arch headers. 
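The fallback is the usual #ifndef idiom (taken from the diff below); an architecture with a boot-time selectable MMU overrides it with a constant upper bound in its own pgtable headers, for instance (hypothetical, the constant name is made up):

        /* generic default in include/linux/pgtable.h */
        #ifndef MAX_PTRS_PER_PTE
        #define MAX_PTRS_PER_PTE PTRS_PER_PTE
        #endif

        /* hypothetical arch override where PTRS_PER_PTE is a runtime value */
        #define MAX_PTRS_PER_PTE        (1UL << ARCH_MAX_PTE_INDEX_SIZE)
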
Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while we're at it: both can just pick up the default now. Link: https://lkml.kernel.org/r/20210624034050.511391-4-dja@axtens.net Signed-off-by: Daniel Axtens Acked-by: Andrey Konovalov Reviewed-by: Christophe Leroy Reviewed-by: Marco Elver Cc: Aneesh Kumar K.V Cc: Balbir Singh Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 2 -- include/asm-generic/pgtable-nop4d.h | 1 - include/linux/pgtable.h | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 29c7ecd5ad1d..b38f7b781564 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -344,8 +344,6 @@ static inline int is_module_addr(void *addr) #define PTRS_PER_P4D _CRST_ENTRIES #define PTRS_PER_PGD _CRST_ENTRIES -#define MAX_PTRS_PER_P4D PTRS_PER_P4D - /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index ce2cbb3c380f..2f6b1befb129 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -9,7 +9,6 @@ typedef struct { pgd_t pgd; } p4d_t; #define P4D_SHIFT PGDIR_SHIFT -#define MAX_PTRS_PER_P4D 1 #define PTRS_PER_P4D 1 #define P4D_SIZE (1UL << P4D_SHIFT) #define P4D_MASK (~(P4D_SIZE-1)) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a43047b1030d..c32600c9e1ad 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1592,4 +1592,26 @@ typedef unsigned int pgtbl_mod_mask; #define pte_leaf_size(x) PAGE_SIZE #endif +/* + * Some architectures have MMUs that are configurable or selectable at boot + * time. These lead to variable PTRS_PER_x. For statically allocated arrays it + * helps to have a static maximum value. + */ + +#ifndef MAX_PTRS_PER_PTE +#define MAX_PTRS_PER_PTE PTRS_PER_PTE +#endif + +#ifndef MAX_PTRS_PER_PMD +#define MAX_PTRS_PER_PMD PTRS_PER_PMD +#endif + +#ifndef MAX_PTRS_PER_PUD +#define MAX_PTRS_PER_PUD PTRS_PER_PUD +#endif + +#ifndef MAX_PTRS_PER_P4D +#define MAX_PTRS_PER_P4D PTRS_PER_P4D +#endif + #endif /* _LINUX_PGTABLE_H */ -- cgit From cb32c9c5d45662770160e0055cb672fd6e0813e8 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Mon, 28 Jun 2021 19:40:49 -0700 Subject: kasan: use MAX_PTRS_PER_* for early shadow tables powerpc has a variable number of PTRS_PER_*, set at runtime based on the MMU that the kernel is booted under. This means the PTRS_PER_* are no longer constants, and therefore breaks the build. Switch to using MAX_PTRS_PER_*, which are constant. 
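The constants matter because the early shadow tables are statically sized objects in .bss, and a non-constant array bound would not compile there; after the patch the declarations read (as in the diff below):

        pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
        pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
        pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
                __page_aligned_bss;
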
Link: https://lkml.kernel.org/r/20210624034050.511391-5-dja@axtens.net Signed-off-by: Daniel Axtens Suggested-by: Christophe Leroy Suggested-by: Balbir Singh Reviewed-by: Christophe Leroy Reviewed-by: Balbir Singh Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Aneesh Kumar K.V Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++--- mm/kasan/init.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 18cd5ec2f469..8d83bbffcfbb 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -40,9 +40,9 @@ struct kunit_kasan_expectation { #endif extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]; -extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; -extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD]; +extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]; +extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD]; +extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 348f31d15a97..cc64ed6858c6 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd) } #endif #if CONFIG_PGTABLE_LEVELS > 3 -pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss; +pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss; static inline bool kasan_pud_table(p4d_t p4d) { return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); @@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d) } #endif #if CONFIG_PGTABLE_LEVELS > 2 -pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss; +pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss; static inline bool kasan_pmd_table(pud_t pud) { return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); @@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud) return false; } #endif -pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS] +pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS] __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) -- cgit From f06f78ab48fb90cfbef5289e5556704b74c46b7a Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Mon, 28 Jun 2021 19:40:52 -0700 Subject: kasan: rename CONFIG_KASAN_SW_TAGS_IDENTIFY to CONFIG_KASAN_TAGS_IDENTIFY Patch series "kasan: add memory corruption identification support for hw tag-based kasan", v4. Add memory corruption identification for hardware tag-based KASAN mode. This patch (of 3): Rename CONFIG_KASAN_SW_TAGS_IDENTIFY to CONFIG_KASAN_TAGS_IDENTIFY in order to be compatible with hardware tag-based mode. 
Link: https://lkml.kernel.org/r/20210626100931.22794-1-Kuan-Ying.Lee@mediatek.com Link: https://lkml.kernel.org/r/20210626100931.22794-2-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Suggested-by: Marco Elver Reviewed-by: Alexander Potapenko Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Matthias Brugger Cc: Chinwen Chang Cc: Nicholas Tang Cc: Kuan-Ying Lee Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 2 +- mm/kasan/kasan.h | 4 ++-- mm/kasan/report_sw_tags.c | 2 +- mm/kasan/sw_tags.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index c3b228828a80..fdb4a08dba83 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -167,7 +167,7 @@ config KASAN_STACK instrumentation is also disabled as it adds inline-style instrumentation that is run unconditionally. -config KASAN_SW_TAGS_IDENTIFY +config KASAN_TAGS_IDENTIFY bool "Enable memory corruption identification" depends on KASAN_SW_TAGS help diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4dbc8def64f4..2317d0943a07 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,7 +153,7 @@ struct kasan_track { depot_stack_handle_t stack; }; -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#ifdef CONFIG_KASAN_TAGS_IDENTIFY #define KASAN_NR_FREE_STACKS 5 #else #define KASAN_NR_FREE_STACKS 1 @@ -170,7 +170,7 @@ struct kasan_alloc_meta { #else struct kasan_track free_track[KASAN_NR_FREE_STACKS]; #endif -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#ifdef CONFIG_KASAN_TAGS_IDENTIFY u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; u8 free_track_idx; #endif diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 3d20d3451d9e..821a14a19a92 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -31,7 +31,7 @@ const char *kasan_get_bug_type(struct kasan_access_info *info) { -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#ifdef CONFIG_KASAN_TAGS_IDENTIFY struct kasan_alloc_meta *alloc_meta; struct kmem_cache *cache; struct page *page; diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 9df8e7f69e87..0d6e5e976231 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -177,7 +177,7 @@ void kasan_set_free_info(struct kmem_cache *cache, if (!alloc_meta) return; -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#ifdef CONFIG_KASAN_TAGS_IDENTIFY idx = alloc_meta->free_track_idx; alloc_meta->free_pointer_tag[idx] = tag; alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; @@ -196,7 +196,7 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, if (!alloc_meta) return NULL; -#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY +#ifdef CONFIG_KASAN_TAGS_IDENTIFY for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { if (alloc_meta->free_pointer_tag[i] == tag) break; -- cgit From a0503b8a0b3c8ef1be55744a248bffb8f533d227 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Mon, 28 Jun 2021 19:40:55 -0700 Subject: kasan: integrate the common part of two KASAN tag-based modes 1. Move kasan_get_free_track() and kasan_set_free_info() into tags.c and combine these two functions for SW_TAGS and HW_TAGS kasan mode. 2. Move kasan_get_bug_type() to report_tags.c and make this function compatible for SW_TAGS and HW_TAGS kasan mode. 
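Concretely, both tag-based modes now pull in the shared objects, as the Makefile hunk below shows:

        obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
        obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
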
Link: https://lkml.kernel.org/r/20210626100931.22794-3-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Suggested-by: Marco Elver Suggested-by: Greg Kroah-Hartman Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Chinwen Chang Cc: Matthias Brugger Cc: Nicholas Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/Makefile | 4 ++-- mm/kasan/hw_tags.c | 22 ------------------ mm/kasan/report_hw_tags.c | 5 ---- mm/kasan/report_sw_tags.c | 43 ---------------------------------- mm/kasan/report_tags.c | 51 ++++++++++++++++++++++++++++++++++++++++ mm/kasan/sw_tags.c | 41 -------------------------------- mm/kasan/tags.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 112 insertions(+), 113 deletions(-) create mode 100644 mm/kasan/report_tags.c create mode 100644 mm/kasan/tags.c diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 9fe39a66388a..adcd9acaef61 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -37,5 +37,5 @@ CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o -obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o -obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o +obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o +obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4004388b4e4b..d867b22ddbb7 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -216,28 +216,6 @@ void __init kasan_init_hw_tags(void) pr_info("KernelAddressSanitizer initialized\n"); } -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT); -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->free_track[0]; -} - #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state) diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 42b2168755d6..5dbbbb930e7a 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,11 +15,6 @@ #include "kasan.h" -const char *kasan_get_bug_type(struct kasan_access_info *info) -{ - return "invalid-access"; -} - void *kasan_find_first_bad_addr(void *addr, size_t size) { return kasan_reset_tag(addr); diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 821a14a19a92..d2298c357834 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -29,49 +29,6 @@ #include "kasan.h" #include "../slab.h" -const char *kasan_get_bug_type(struct kasan_access_info *info) -{ -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct page *page; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - page = kasan_addr_to_page(addr); - if (page && PageSlab(page)) { - cache = page->slab_cache; - object = nearest_obj(cache, page, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, 
object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } - -#endif - /* - * If access_size is a negative number, then it has reason to be - * defined as out-of-bounds bug type. - * - * Casting negative numbers to size_t would indeed turn up as - * a large size_t and its value will be larger than ULONG_MAX/2, - * so that this can qualify as out-of-bounds. - */ - if (info->access_addr + info->access_size < info->access_addr) - return "out-of-bounds"; - - return "invalid-access"; -} - void *kasan_find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c new file mode 100644 index 000000000000..8a319fc16dab --- /dev/null +++ b/mm/kasan/report_tags.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Copyright (c) 2020 Google, Inc. + */ + +#include "kasan.h" +#include "../slab.h" + +const char *kasan_get_bug_type(struct kasan_access_info *info) +{ +#ifdef CONFIG_KASAN_TAGS_IDENTIFY + struct kasan_alloc_meta *alloc_meta; + struct kmem_cache *cache; + struct page *page; + const void *addr; + void *object; + u8 tag; + int i; + + tag = get_tag(info->access_addr); + addr = kasan_reset_tag(info->access_addr); + page = kasan_addr_to_page(addr); + if (page && PageSlab(page)) { + cache = page->slab_cache; + object = nearest_obj(cache, page, (void *)addr); + alloc_meta = kasan_get_alloc_meta(cache, object); + + if (alloc_meta) { + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + return "use-after-free"; + } + } + return "out-of-bounds"; + } +#endif + + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. 
+ */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + + return "invalid-access"; +} diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 0d6e5e976231..675e67375fb5 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -166,44 +166,3 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) kasan_poison((void *)addr, size, tag, false); } EXPORT_SYMBOL(__hwasan_tag_memory); - -void kasan_set_free_info(struct kmem_cache *cache, - void *object, u8 tag) -{ - struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; - -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif - - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - struct kasan_alloc_meta *alloc_meta; - int i = 0; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; - } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif - - return &alloc_meta->free_track[i]; -} diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c new file mode 100644 index 000000000000..8f48b9502a17 --- /dev/null +++ b/mm/kasan/tags.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common tag-based KASAN code. + * + * Copyright (c) 2018 Google, Inc. + * Copyright (c) 2020 Google, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + u8 idx = 0; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + +#ifdef CONFIG_KASAN_TAGS_IDENTIFY + idx = alloc_meta->free_track_idx; + alloc_meta->free_pointer_tag[idx] = tag; + alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; +#endif + + kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + int i = 0; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + +#ifdef CONFIG_KASAN_TAGS_IDENTIFY + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + break; + } + if (i == KASAN_NR_FREE_STACKS) + i = alloc_meta->free_track_idx; +#endif + + return &alloc_meta->free_track[i]; +} -- cgit From 7a22bdc3c443d5abc420df1381e425b49e8901a3 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Mon, 28 Jun 2021 19:40:58 -0700 Subject: kasan: add memory corruption identification support for hardware tag-based mode Add memory corruption identification support for hardware tag-based mode. We store one old free pointer tag and free backtrace instead of five because hardware tag-based kasan only has 16 different tags. If we store as many stacks as SW tag-based kasan does(5 stacks), there is high probability to find the same tag in the stacks when out-of-bound issues happened and we will mistake out-of-bound issue for use-after-free. 
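The trade-off is easy to quantify: with only 16 hardware tags, remembering five old free-pointer tags (as the software tag-based mode does) makes spurious matches likely, so an out-of-bounds access would often be misreported as a use-after-free. A small back-of-the-envelope C program (illustrative only, plain userspace) makes the point:

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		/* MTE provides 16 possible tags, so an unrelated out-of-bounds
		 * access matches any one remembered free-pointer tag with
		 * probability 1/16. */
		double miss = 15.0 / 16.0;

		printf("chance of a spurious tag match, 1 stored stack:  %.2f%%\n",
		       (1.0 - miss) * 100.0);
		printf("chance of a spurious tag match, 5 stored stacks: %.2f%%\n",
		       (1.0 - pow(miss, 5)) * 100.0);	/* ~6.25% vs ~27.6% */
		return 0;
	}

which is why the hardware tag-based mode keeps a single free stack and tag.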
Link: https://lkml.kernel.org/r/20210626100931.22794-4-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Suggested-by: Marco Elver Reviewed-by: Alexander Potapenko Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Chinwen Chang Cc: Greg Kroah-Hartman Cc: Matthias Brugger Cc: Nicholas Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 2 +- mm/kasan/kasan.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index fdb4a08dba83..1e2d10f86011 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -169,7 +169,7 @@ config KASAN_STACK config KASAN_TAGS_IDENTIFY bool "Enable memory corruption identification" - depends on KASAN_SW_TAGS + depends on KASAN_SW_TAGS || KASAN_HW_TAGS help This option enables best-effort identification of bug type (use-after-free or out-of-bounds) at the cost of increased diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2317d0943a07..98e3059bfea4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,7 +153,7 @@ struct kasan_track { depot_stack_handle_t stack; }; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY +#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) #define KASAN_NR_FREE_STACKS 5 #else #define KASAN_NR_FREE_STACKS 1 -- cgit From c5a54c706e04a4ba7c4e3428776ac9e44aec17ea Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Mon, 28 Jun 2021 19:41:02 -0700 Subject: mm: report which part of mem is being freed on initmem case Add the details for figuring out which parts of the kernel image is being freed on initmem case. Before: Freeing unused kernel memory: 1024K After: Freeing unused kernel image (initmem) memory: 1024K Link: https://lkml.kernel.org/r/1622706274-4533-1-git-send-email-js07.lee@samsung.com Signed-off-by: Jungseung Lee Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a98b5447a3b..f08e9de92fc5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2416,7 +2416,7 @@ static inline unsigned long free_initmem_default(int poison) extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, - poison, "unused kernel"); + poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) -- cgit From b19bd1c976afeefc2ebba3d4dae8a4c296dae67f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:41:04 -0700 Subject: mm/mmzone.h: simplify is_highmem_idx() There is a lot of historical ifdefery in is_highmem_idx() and its helper zone_movable_is_highmem() that was required because of two different paths for nodes and zones initialization that were selected at compile time. Until commit 3f08a302f533 ("mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option") the movable_zone variable was only available for configurations that had CONFIG_HAVE_MEMBLOCK_NODE_MAP enabled so the test in zone_movable_is_highmem() used that variable only for such configurations. For other configurations the test checked if the index of ZONE_MOVABLE was greater by 1 than the index of ZONE_HIGMEM and then movable zone was considered a highmem zone. Needless to say, ZONE_MOVABLE - 1 equals ZONE_HIGHMEM by definition when CONFIG_HIGHMEM=y. Commit 3f08a302f533 ("mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option") made movable_zone variable always available. 
Since this variable is set to ZONE_HIGHMEM if CONFIG_HIGHMEM is enabled and highmem zone is populated, it is enough to check whether zone_idx == ZONE_MOVABLE && movable_zone == ZONE_HIGMEM to test if zone index points to a highmem zone. Remove zone_movable_is_highmem() that is not used anywhere except is_highmem_idx() and use the test above in is_highmem_idx() instead. Link: https://lkml.kernel.org/r/20210426141927.1314326-3-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Anshuman Khandual Cc: Jonathan Corbet Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0d53eba1c383..c2bfefd34b59 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -982,22 +982,11 @@ static inline void zone_set_nid(struct zone *zone, int nid) {} extern int movable_zone; -#ifdef CONFIG_HIGHMEM -static inline int zone_movable_is_highmem(void) -{ -#ifdef CONFIG_NEED_MULTIPLE_NODES - return movable_zone == ZONE_HIGHMEM; -#else - return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; -#endif -} -#endif - static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || - (idx == ZONE_MOVABLE && zone_movable_is_highmem())); + (idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM)); #else return 0; #endif -- cgit From d2f07ec052ac1a720d6f1919e3dee7d73f04d495 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:07 -0700 Subject: mm: make __dump_page static Patch series "Constify struct page arguments". While working on various solutions to the 32-bit struct page size regression, one of the problems I found was the networking stack expects to be able to pass const struct page pointers around, and the mm doesn't provide a lot of const-friendly functions to call. The root tangle of problems is that a lot of functions call VM_BUG_ON_PAGE(), which calls dump_page(), which calls a lot of functions which don't take a const struct page (but could be const). This patch (of 6): The only caller of __dump_page() now opencodes dump_page(), so remove it as an externally visible symbol. 
Link: https://lkml.kernel.org/r/20210416231531.2521383-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210416231531.2521383-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 3 +-- mm/debug.c | 2 +- mm/page_alloc.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 5d0767cb424a..1935d4c72d10 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,8 +9,7 @@ struct page; struct vm_area_struct; struct mm_struct; -extern void dump_page(struct page *page, const char *reason); -extern void __dump_page(struct page *page, const char *reason); +void dump_page(struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); diff --git a/mm/debug.c b/mm/debug.c index 0bdda8407f71..84cdcd0f7bd3 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,7 +42,7 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -void __dump_page(struct page *page, const char *reason) +static void __dump_page(struct page *page, const char *reason) { struct page *head = compound_head(page); struct address_space *mapping; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bf03c76504b..4087340fca32 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -658,8 +658,7 @@ static void bad_page(struct page *page, const char *reason) pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - __dump_page(page, reason); - dump_page_owner(page); + dump_page(page, reason); print_modules(); dump_stack(); -- cgit From 691d9497285a90346a67bfee5cac2007e5e18405 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 28 Jun 2021 19:41:10 -0700 Subject: mm/page_alloc: bail out on fatal signal during reclaim/compaction retry attempt A customer experienced a low-memory situation and decided to issue a SIGKILL (i.e. a fatal signal). Instead of promptly terminating as one would expect, the aforementioned task remained unresponsive. Further investigation indicated that the task was "stuck" in the reclaim/compaction retry loop. Now, it does not make sense to retry compaction when a fatal signal is pending. In the context of try_to_compact_pages(), indeed COMPACT_SKIPPED can be returned; albeit, not every zone, on the zone list, would be considered in the case a fatal signal is found to be pending. Yet, in should_compact_retry(), given the last known compaction result, each zone, on the zone list, can be considered/or checked (see compaction_zonelist_suitable()). For example, if a zone was found to succeed, then reclaim/compaction would be tried again (notwithstanding the above). This patch ensures that compaction is not needlessly retried irrespective of the last known compaction result e.g. if it was skipped, in the unlikely case a fatal signal is found pending. So, OOM is at least attempted. 
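In other words, the retry decision should short-circuit on a dying task before it ever consults the per-zone compaction heuristics. A condensed sketch of that control flow (illustrative only; parameter names here are mine, not the upstream function's):

	#include <linux/sched/signal.h>

	static inline bool retry_compaction_sketch(unsigned int order,
						   bool made_progress,
						   int *retries, int max_retries)
	{
		if (!order)
			return false;

		/* The fix: a pending SIGKILL means the task is exiting, so do
		 * not loop here - let the allocator fall through towards OOM. */
		if (fatal_signal_pending(current))
			return false;

		if (made_progress)
			(*retries)++;

		return *retries <= max_retries;
	}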
Link: https://lkml.kernel.org/r/20210520142901.3371299-1-atomlin@redhat.com Signed-off-by: Aaron Tomlin Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4087340fca32..ea1efbb06e40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4251,6 +4251,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, if (!order) return false; + if (fatal_signal_pending(current)) + return false; + if (compaction_made_progress(compact_result)) (*compaction_retries)++; -- cgit From be7c701fd42c2dd124ec5ce3493ec72e217738a8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:13 -0700 Subject: mm/debug: factor PagePoisoned out of __dump_page Move the PagePoisoned test into dump_page(). Skip the hex print for poisoned pages -- we know they're full of ffffffff. Move the reason printing from __dump_page() to dump_page(). Link: https://lkml.kernel.org/r/20210416231531.2521383-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 84cdcd0f7bd3..e73fe0a8ec3d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -42,11 +42,10 @@ const struct trace_print_flags vmaflag_names[] = { {0, NULL} }; -static void __dump_page(struct page *page, const char *reason) +static void __dump_page(struct page *page) { struct page *head = compound_head(page); struct address_space *mapping; - bool page_poisoned = PagePoisoned(page); bool compound = PageCompound(page); /* * Accessing the pageblock without the zone lock. It could change to @@ -58,16 +57,6 @@ static void __dump_page(struct page *page, const char *reason) int mapcount; char *type = ""; - /* - * If struct page is poisoned don't access Page*() functions as that - * leads to recursive loop. Page*() check for poisoned pages, and calls - * dump_page() when detected. - */ - if (page_poisoned) { - pr_warn("page:%px is uninitialized and poisoned", page); - goto hex_only; - } - if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) { /* * Corrupt page, so we cannot call page_mapping. Instead, do a @@ -173,8 +162,6 @@ out_mapping: pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags, page_cma ? 
" CMA" : ""); - -hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, sizeof(struct page), false); @@ -182,14 +169,16 @@ hex_only: print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), head, sizeof(struct page), false); - - if (reason) - pr_warn("page dumped because: %s\n", reason); } void dump_page(struct page *page, const char *reason) { - __dump_page(page, reason); + if (PagePoisoned(page)) + pr_warn("page:%p is uninitialized and poisoned", page); + else + __dump_page(page); + if (reason) + pr_warn("page dumped because: %s\n", reason); dump_page_owner(page); } EXPORT_SYMBOL(dump_page); -- cgit From 8bf6f451bded5db7840b3b2932ef48be5dce6b38 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:16 -0700 Subject: mm/page_owner: constify dump_page_owner dump_page_owner() only uses struct page to find the page_ext, and lookup_page_ext() already takes a const argument. Link: https://lkml.kernel.org/r/20210416231531.2521383-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_owner.h | 6 +++--- mm/page_owner.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 3468794f83d2..719bfe5108c5 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h @@ -14,7 +14,7 @@ extern void __set_page_owner(struct page *page, extern void __split_page_owner(struct page *page, unsigned int nr); extern void __copy_page_owner(struct page *oldpage, struct page *newpage); extern void __set_page_owner_migrate_reason(struct page *page, int reason); -extern void __dump_page_owner(struct page *page); +extern void __dump_page_owner(const struct page *page); extern void pagetypeinfo_showmixedcount_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone); @@ -46,7 +46,7 @@ static inline void set_page_owner_migrate_reason(struct page *page, int reason) if (static_branch_unlikely(&page_owner_inited)) __set_page_owner_migrate_reason(page, reason); } -static inline void dump_page_owner(struct page *page) +static inline void dump_page_owner(const struct page *page) { if (static_branch_unlikely(&page_owner_inited)) __dump_page_owner(page); @@ -69,7 +69,7 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage) static inline void set_page_owner_migrate_reason(struct page *page, int reason) { } -static inline void dump_page_owner(struct page *page) +static inline void dump_page_owner(const struct page *page) { } #endif /* CONFIG_PAGE_OWNER */ diff --git a/mm/page_owner.c b/mm/page_owner.c index adfabb560eb9..f51a57e92aa3 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -392,7 +392,7 @@ err: return -ENOMEM; } -void __dump_page_owner(struct page *page) +void __dump_page_owner(const struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; -- cgit From 0f2317e34e2c7b97efd4600122115410795ebeea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:19 -0700 Subject: mm: make compound_head const-preserving If you pass a const pointer to compound_head(), you get a const pointer back; if you pass a mutable pointer, you get a mutable pointer back. 
Also remove an unnecessary forward definition of struct page; we're about to dereference page->compound_head, so it must already have been defined. Link: https://lkml.kernel.org/r/20210416231531.2521383-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 04a34c08e0a6..d8e26243db25 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -177,17 +177,17 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H -struct page; /* forward declaration */ - -static inline struct page *compound_head(struct page *page) +static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) - return (struct page *) (head - 1); - return page; + return head - 1; + return (unsigned long)page; } +#define compound_head(page) ((typeof(page))_compound_head(page)) + static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; -- cgit From ca891f41c4c7921a03dfd0fa1faf324393724480 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:22 -0700 Subject: mm: constify get_pfnblock_flags_mask and get_pfnblock_migratetype The struct page is not modified by these routines, so it can be marked const. Link: https://lkml.kernel.org/r/20210416231531.2521383-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pageblock-flags.h | 2 +- mm/page_alloc.c | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index fff52ad370c1..973fd731a520 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,7 +54,7 @@ extern unsigned int pageblock_order; /* Forward declaration */ struct page; -unsigned long get_pfnblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ea1efbb06e40..4f5eedb6593a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -474,7 +474,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) #endif /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, +static inline unsigned long *get_pageblock_bitmap(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM @@ -484,7 +484,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); @@ -495,7 +495,7 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) } static __always_inline -unsigned long __get_pfnblock_flags_mask(struct page *page, +unsigned long __get_pfnblock_flags_mask(const struct page *page, unsigned long pfn, unsigned long mask) { @@ -520,13 +520,14 
@@ unsigned long __get_pfnblock_flags_mask(struct page *page, * * Return: pageblock_bits flags */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long mask) +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { return __get_pfnblock_flags_mask(page, pfn, mask); } -static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +static __always_inline int get_pfnblock_migratetype(const struct page *page, + unsigned long pfn) { return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } -- cgit From 5f7dadf3958f882b393d3c4c60da232dbac66424 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:25 -0700 Subject: mm: constify page_count and page_ref_count Now that compound_head() accepts a const struct page pointer, these two functions can be marked as not modifying the page pointer they are passed. Link: https://lkml.kernel.org/r/20210416231531.2521383-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Vlastimil Babka Reviewed-by: Anshuman Khandual Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ref.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index f3318f34fc54..7ad46f45df39 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -62,12 +62,12 @@ static inline void __page_ref_unfreeze(struct page *page, int v) #endif -static inline int page_ref_count(struct page *page) +static inline int page_ref_count(const struct page *page) { return atomic_read(&page->_refcount); } -static inline int page_count(struct page *page) +static inline int page_count(const struct page *page) { return atomic_read(&compound_head(page)->_refcount); } -- cgit From 1cfcee728391ece94a75e4b17fa87253d40c2185 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 28 Jun 2021 19:41:28 -0700 Subject: mm: optimise nth_page for contiguous memmap If the memmap is virtually contiguous (either because we're using a virtually mapped memmap or because we don't support a discontig memmap at all), then we can implement nth_page() by simple addition. Contrary to popular belief, the compiler is not able to optimise this itself for a vmemmap configuration. 
This reduces one example user (sg.c) by four instructions: struct page *page = nth_page(rsv_schp->pages[k], offset >> PAGE_SHIFT); before: 49 8b 45 70 mov 0x70(%r13),%rax 48 63 c9 movslq %ecx,%rcx 48 c1 eb 0c shr $0xc,%rbx 48 8b 04 c8 mov (%rax,%rcx,8),%rax 48 2b 05 00 00 00 00 sub 0x0(%rip),%rax R_X86_64_PC32 vmemmap_base-0x4 48 c1 f8 06 sar $0x6,%rax 48 01 d8 add %rbx,%rax 48 c1 e0 06 shl $0x6,%rax 48 03 05 00 00 00 00 add 0x0(%rip),%rax R_X86_64_PC32 vmemmap_base-0x4 after: 49 8b 45 70 mov 0x70(%r13),%rax 48 63 c9 movslq %ecx,%rcx 48 c1 eb 0c shr $0xc,%rbx 48 c1 e3 06 shl $0x6,%rbx 48 03 1c c8 add (%rax,%rcx,8),%rbx Link: https://lkml.kernel.org/r/20210413194625.1472345-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Tejun Heo Cc: FUJITA Tomonori Cc: Douglas Gilbert Cc: Chris Wilson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index f08e9de92fc5..9bd21e6fad6a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -234,7 +234,11 @@ int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void **shadowp); +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) +#else +#define nth_page(page,n) ((page) + (n)) +#endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) -- cgit From 9660ecaa79ce5c068aa3138ca7e29a9402f284ed Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 28 Jun 2021 19:41:31 -0700 Subject: mm/page_alloc: switch to pr_debug Having such debug messages in the dmesg log may confuse users. Therefore restrict debug output to cases where DEBUG is defined or dynamic debugging is enabled for the respective code piece. 
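The behaviour relied on here is that pr_debug() costs nothing unless DEBUG is defined (or dynamic debug enables it at runtime). A userspace analogue of that gating, for illustration only:

	#include <stdio.h>

	#ifdef DEBUG
	#define pr_debug(fmt, ...) fprintf(stderr, "debug: " fmt, ##__VA_ARGS__)
	#else
	#define pr_debug(fmt, ...) do { } while (0)	/* no output, no overhead */
	#endif

	int main(void)
	{
		pr_debug("%s zone: %lu pages, LIFO batch:%u\n", "Normal", 262144UL, 63U);
		printf("boot log stays quiet unless built with -DDEBUG\n");
		return 0;
	}
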
Link: https://lkml.kernel.org/r/976adb93-3041-ce63-48fc-55a6096a51c1@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f5eedb6593a..902f889a324d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6770,9 +6770,8 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", - zone->name, zone->present_pages, - zone_batchsize(zone)); + pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, + zone->present_pages, zone_batchsize(zone)); } void __meminit init_currently_empty_zone(struct zone *zone, @@ -7042,8 +7041,7 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, - realtotalpages); + pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } #ifndef CONFIG_SPARSEMEM @@ -7243,9 +7241,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat) if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) - printk(KERN_DEBUG - " %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + pr_debug(" %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); @@ -7254,8 +7251,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) /* Account for reserved pages */ if (j == 0 && freesize > dma_reserve) { freesize -= dma_reserve; - printk(KERN_DEBUG " %s zone: %lu pages reserved\n", - zone_names[0], dma_reserve); + pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) -- cgit From a0b8200d06ad6450c179407baa5f0f52f8cfcc97 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 28 Jun 2021 19:41:34 -0700 Subject: kbuild: skip per-CPU BTF generation for pahole v1.18-v1.21 Commit "mm/page_alloc: convert per-cpu list protection to local_lock" will introduce a zero-sized per-CPU variable, which causes pahole to generate invalid BTF. Only pahole versions 1.18 through 1.21 are impacted, as before 1.18 pahole doesn't know anything about per-CPU variables, and 1.22 contains the proper fix for the issue. Luckily, pahole 1.18 got --skip_encoding_btf_vars option disabling BTF generation for per-CPU variables in anticipation of some unanticipated problems. So use this escape hatch to disable per-CPU var BTF info on those problematic pahole versions. Users relying on availability of per-CPU var BTFs would need to upgrade to pahole 1.22+, but everyone won't notice any regressions. 
Link: https://lkml.kernel.org/r/20210530002536.3193829-1-andrii@kernel.org Signed-off-by: Andrii Nakryiko Acked-by: Mel Gorman Cc: Arnaldo Carvalho de Melo Cc: Hao Luo Cc: Michal Suchanek Cc: Jiri Olsa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/link-vmlinux.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 0e0f6466b18d..475faa15854e 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -235,6 +235,10 @@ gen_btf() vmlinux_link ${1} + if [ "${pahole_ver}" -ge "118" ] && [ "${pahole_ver}" -le "121" ]; then + # pahole 1.18 through 1.21 can't handle zero-sized per-CPU vars + extra_paholeopt="${extra_paholeopt} --skip_encoding_btf_vars" + fi if [ "${pahole_ver}" -ge "121" ]; then extra_paholeopt="${extra_paholeopt} --btf_gen_floats" fi -- cgit From 28f836b6777b6f42dce068a40d83a891deaaca37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:38 -0700 Subject: mm/page_alloc: split per cpu page lists and zone stats The PCP (per-cpu page allocator in page_alloc.c) shares locking requirements with vmstat and the zone lock which is inconvenient and causes some issues. For example, the PCP list and vmstat share the same per-cpu space meaning that it's possible that vmstat updates dirty cache lines holding per-cpu lists across CPUs unless padding is used. Second, PREEMPT_RT does not want to disable IRQs for too long in the page allocator. This series splits the locking requirements and uses locks types more suitable for PREEMPT_RT, reduces the time when special locking is required for stats and reduces the time when IRQs need to be disabled on !PREEMPT_RT kernels. Why local_lock? PREEMPT_RT considers the following sequence to be unsafe as documented in Documentation/locking/locktypes.rst local_irq_disable(); spin_lock(&lock); The pcp allocator has this sequence for rmqueue_pcplist (local_irq_save) -> __rmqueue_pcplist -> rmqueue_bulk (spin_lock). While it's possible to separate this out, it generally means there are points where we enable IRQs and reenable them again immediately. To prevent a migration and the per-cpu pointer going stale, migrate_disable is also needed. That is a custom lock that is similar, but worse, than local_lock. Furthermore, on PREEMPT_RT, it's undesirable to leave IRQs disabled for too long. By converting to local_lock which disables migration on PREEMPT_RT, the locking requirements can be separated and start moving the protections for PCP, stats and the zone lock to PREEMPT_RT-safe equivalent locking. As a bonus, local_lock also means that PROVE_LOCKING does something useful. After that, it's obvious that zone_statistics incurs too much overhead and leaves IRQs disabled for longer than necessary on !PREEMPT_RT kernels. zone_statistics uses perfectly accurate counters requiring IRQs be disabled for parallel RMW sequences when inaccurate ones like vm_events would do. The series makes the NUMA statistics (NUMA_HIT and friends) inaccurate counters that then require no special protection on !PREEMPT_RT. The bulk page allocator can then do stat updates in bulk with IRQs enabled which should improve the efficiency. Technically, this could have been done without the local_lock and vmstat conversion work and the order simply reflects the timing of when different series were implemented. Finally, there are places where we conflate IRQs being disabled for the PCP with the IRQ-safe zone spinlock. 
The remainder of the series reduces the scope of what is protected by disabled IRQs on !PREEMPT_RT kernels. By the end of the series, page_alloc.c does not call local_irq_save so the locking scope is a bit clearer. The one exception is that modifying NR_FREE_PAGES still happens in places where it's known the IRQs are disabled as it's harmless for PREEMPT_RT and would be expensive to split the locking there. No performance data is included because despite the overhead of the stats, it's within the noise for most workloads on !PREEMPT_RT. However, Jesper Dangaard Brouer ran a page allocation microbenchmark on a E5-1650 v4 @ 3.60GHz CPU on the first version of this series. Focusing on the array variant of the bulk page allocator reveals the following. (CPU: Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz) ARRAY variant: time_bulk_page_alloc_free_array: step=bulk size Baseline Patched 1 56.383 54.225 (+3.83%) 2 40.047 35.492 (+11.38%) 3 37.339 32.643 (+12.58%) 4 35.578 30.992 (+12.89%) 8 33.592 29.606 (+11.87%) 16 32.362 28.532 (+11.85%) 32 31.476 27.728 (+11.91%) 64 30.633 27.252 (+11.04%) 128 30.596 27.090 (+11.46%) While this is a positive outcome, the series is more likely to be interesting to the RT people in terms of getting parts of the PREEMPT_RT tree into mainline. This patch (of 9): The per-cpu page allocator lists and the per-cpu vmstat deltas are stored in the same struct per_cpu_pages even though vmstats have no direct impact on the per-cpu page lists. This is inconsistent because the vmstats for a node are stored on a dedicated structure. The bigger issue is that the per_cpu_pages structure is not cache-aligned and stat updates either cache conflict with adjacent per-cpu lists incurring a runtime cost or padding is required incurring a memory cost. This patch splits the per-cpu pagelists and the vmstat deltas into separate structures. It's mostly a mechanical conversion but some variable renaming is done to clearly distinguish the per-cpu pages structure (pcp) from the vmstats (pzstats). Superficially, this appears to increase the size of the per_cpu_pages structure but the movement of expire fills a structure hole so there is no impact overall. 
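The shape of the change is easiest to see in miniature: keep the hot free-list state and the colder stat deltas in two separate per-CPU structures, so that counter updates never dirty the cache lines the allocation fast path is using. A simplified userspace sketch of that layout split (field names shortened, not the kernel's):

	#include <stdalign.h>
	#include <stddef.h>
	#include <stdio.h>

	struct pcp_lists {		/* hot: touched on every alloc/free */
		int count;
		int high;
		int batch;
	};

	struct pcp_zonestats {		/* colder: folded into global counters periodically */
		signed char stat_diff[8];
	};

	/* one pair per CPU, each half starting on its own cache line */
	struct cpu_slot {
		alignas(64) struct pcp_lists lists;
		alignas(64) struct pcp_zonestats stats;
	};

	int main(void)
	{
		printf("lists offset %zu, stats offset %zu, slot size %zu\n",
		       offsetof(struct cpu_slot, lists),
		       offsetof(struct cpu_slot, stats),
		       sizeof(struct cpu_slot));
		return 0;
	}

In the kernel the two halves become two independent per-CPU allocations (zone->per_cpu_pageset and zone->per_cpu_zonestats), which gives the same separation without explicit padding.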
[mgorman@techsingularity.net: make it W=1 cleaner] Link: https://lkml.kernel.org/r/20210514144622.GA3735@techsingularity.net [mgorman@techsingularity.net: make it W=1 even cleaner] Link: https://lkml.kernel.org/r/20210516140705.GB3735@techsingularity.net [lkp@intel.com: check struct per_cpu_zonestat has a non-zero size] [vbabka@suse.cz: Init zone->per_cpu_zonestats properly] Link: https://lkml.kernel.org/r/20210512095458.30632-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210512095458.30632-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Jesper Dangaard Brouer Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 18 +++++----- include/linux/vmstat.h | 8 ++--- mm/page_alloc.c | 85 +++++++++++++++++++++++-------------------- mm/vmstat.c | 98 ++++++++++++++++++++++++++------------------------ 4 files changed, 113 insertions(+), 96 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c2bfefd34b59..a50b123ab7ae 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -341,20 +341,21 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ +#ifdef CONFIG_NUMA + int expire; /* When 0, remote pagesets are drained */ +#endif /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; -struct per_cpu_pageset { - struct per_cpu_pages pcp; -#ifdef CONFIG_NUMA - s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; -#endif +struct per_cpu_zonestat { #ifdef CONFIG_SMP - s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; + s8 stat_threshold; +#endif +#ifdef CONFIG_NUMA + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif }; @@ -484,7 +485,8 @@ struct zone { int node; #endif struct pglist_data *zone_pgdat; - struct per_cpu_pageset __percpu *pageset; + struct per_cpu_pages __percpu *per_cpu_pageset; + struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3299cd69e4ca..0c5f36504613 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone, int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; return x; } @@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; @@ -291,7 +291,7 @@ struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); @@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, 
- struct per_cpu_pageset *pset) { } + struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 902f889a324d..330c7307a92b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3026,15 +3026,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); + local_irq_restore(flags); } @@ -3133,7 +3132,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { - struct per_cpu_pageset *pcp; + struct per_cpu_pages *pcp; struct zone *z; bool has_pcps = false; @@ -3144,13 +3143,13 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) */ has_pcps = true; } else if (zone) { - pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + if (pcp->count) has_pcps = true; } else { for_each_populated_zone(z) { - pcp = per_cpu_ptr(z->pageset, cpu); - if (pcp->pcp.count) { + pcp = per_cpu_ptr(z->per_cpu_pageset, cpu); + if (pcp->count) { has_pcps = true; break; } @@ -3280,7 +3279,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) migratetype = MIGRATE_MOVABLE; } - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= READ_ONCE(pcp->high)) @@ -3496,7 +3495,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { @@ -5105,7 +5104,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; while (nr_populated < nr_pages) { @@ -5720,7 +5719,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) continue; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" @@ -5812,7 +5811,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) free_pcp = 0; for_each_online_cpu(cpu) - free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; show_node(zone); printk(KERN_CONT @@ -5853,7 +5852,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), - K(this_cpu_read(zone->pageset->pcp.count)), + K(this_cpu_read(zone->per_cpu_pageset->count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES))); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -6180,11 +6179,12 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before 
following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void pageset_init(struct per_cpu_pageset *p); +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); /* These effectively disable the pcplists in the boot pageset completely */ #define BOOT_PAGESET_HIGH 0 #define BOOT_PAGESET_BATCH 1 -static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) @@ -6251,7 +6251,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) - pageset_init(&per_cpu(boot_pageset, cpu)); + per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -6650,14 +6650,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, WRITE_ONCE(pcp->high, high); } -static void pageset_init(struct per_cpu_pageset *p) +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - struct per_cpu_pages *pcp; int migratetype; - memset(p, 0, sizeof(*p)); + memset(pcp, 0, sizeof(*pcp)); + memset(pzstats, 0, sizeof(*pzstats)); - pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); @@ -6674,12 +6673,12 @@ static void pageset_init(struct per_cpu_pageset *p) static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, unsigned long batch) { - struct per_cpu_pageset *p; + struct per_cpu_pages *pcp; int cpu; for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_update(&p->pcp, high, batch); + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pageset_update(pcp, high, batch); } } @@ -6714,13 +6713,20 @@ static void zone_set_pageset_high_and_batch(struct zone *zone) void __meminit setup_zone_pageset(struct zone *zone) { - struct per_cpu_pageset *p; int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); + /* Size may be 0 on !SMP && !NUMA */ + if (sizeof(struct per_cpu_zonestat) > 0) + zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); + + zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); for_each_possible_cpu(cpu) { - p = per_cpu_ptr(zone->pageset, cpu); - pageset_init(p); + struct per_cpu_pages *pcp; + struct per_cpu_zonestat *pzstats; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + per_cpu_pages_init(pcp, pzstats); } zone_set_pageset_high_and_batch(zone); @@ -6747,9 +6753,9 @@ void __init setup_per_cpu_pageset(void) * the nodes these zones are associated with. */ for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu); - memset(pcp->vm_numa_stat_diff, 0, - sizeof(pcp->vm_numa_stat_diff)); + struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); + memset(pzstats->vm_numa_stat_diff, 0, + sizeof(pzstats->vm_numa_stat_diff)); } #endif @@ -6765,7 +6771,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * relies on the ability of the linker to provide the * offset of a (static) per cpu variable into the per cpu area. 
*/ - zone->pageset = &boot_pageset; + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; zone->pageset_high = BOOT_PAGESET_HIGH; zone->pageset_batch = BOOT_PAGESET_BATCH; @@ -9046,15 +9053,17 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { int cpu; - struct per_cpu_pageset *pset; + struct per_cpu_zonestat *pzstats; - if (zone->pageset != &boot_pageset) { + if (zone->per_cpu_pageset != &boot_pageset) { for_each_online_cpu(cpu) { - pset = per_cpu_ptr(zone->pageset, cpu); - drain_zonestat(zone, pset); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + drain_zonestat(zone, pzstats); } - free_percpu(zone->pageset); - zone->pageset = &boot_pageset; + free_percpu(zone->per_cpu_pageset); + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; } } diff --git a/mm/vmstat.c b/mm/vmstat.c index cccee36b289c..f1400ba46beb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -44,7 +44,7 @@ static void zero_zone_numa_counters(struct zone *zone) for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { atomic_long_set(&zone->vm_numa_stat[item], 0); for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item] = 0; } } @@ -266,7 +266,7 @@ void refresh_zone_stat_thresholds(void) for_each_online_cpu(cpu) { int pgdat_threshold; - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; /* Base nodestat threshold on the largest populated zone. */ @@ -303,7 +303,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, threshold = (*calculate_pressure)(zone); for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold = threshold; } } @@ -316,7 +316,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; long x; long t; @@ -389,7 +389,7 @@ EXPORT_SYMBOL(__mod_node_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; @@ -435,7 +435,7 @@ EXPORT_SYMBOL(__inc_node_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; @@ -495,7 +495,7 @@ EXPORT_SYMBOL(__dec_node_page_state); static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item, long delta, int overstep_mode) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; long o, n, t, z; @@ -781,19 +781,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets) int changes = 0; for_each_populated_zone(zone) { - struct per_cpu_pageset __percpu *p = zone->pageset; + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; +#ifdef CONFIG_NUMA + struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; +#endif for (i = 0; i < 
NR_VM_ZONE_STAT_ITEMS; i++) { int v; - v = this_cpu_xchg(p->vm_stat_diff[i], 0); + v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0); if (v) { atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - __this_cpu_write(p->expire, 3); + __this_cpu_write(pcp->expire, 3); #endif } } @@ -801,12 +804,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { int v; - v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0); + v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0); if (v) { atomic_long_add(v, &zone->vm_numa_stat[i]); global_numa_diff[i] += v; - __this_cpu_write(p->expire, 3); + __this_cpu_write(pcp->expire, 3); } } @@ -819,23 +822,23 @@ static int refresh_cpu_vm_stats(bool do_pagesets) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!__this_cpu_read(p->expire) || - !__this_cpu_read(p->pcp.count)) + if (!__this_cpu_read(pcp->expire) || + !__this_cpu_read(pcp->count)) continue; /* * We never drain zones local to this processor. */ if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); + __this_cpu_write(pcp->expire, 0); continue; } - if (__this_cpu_dec_return(p->expire)) + if (__this_cpu_dec_return(pcp->expire)) continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + if (__this_cpu_read(pcp->count)) { + drain_zone_pages(zone, this_cpu_ptr(pcp)); changes++; } } @@ -882,27 +885,27 @@ void cpu_vm_stats_fold(int cpu) int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_zonestat *pzstats; - p = per_cpu_ptr(zone->pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { + if (pzstats->vm_stat_diff[i]) { int v; - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } #ifdef CONFIG_NUMA for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (p->vm_numa_stat_diff[i]) { + if (pzstats->vm_numa_stat_diff[i]) { int v; - v = p->vm_numa_stat_diff[i]; - p->vm_numa_stat_diff[i] = 0; + v = pzstats->vm_numa_stat_diff[i]; + pzstats->vm_numa_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_numa_stat[i]); global_numa_diff[i] += v; } @@ -936,24 +939,24 @@ void cpu_vm_stats_fold(int cpu) * this is only called if !populated_zone(zone), which implies no other users of * pset->vm_stat_diff[] exist. 
*/ -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { int i; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (pset->vm_stat_diff[i]) { - int v = pset->vm_stat_diff[i]; - pset->vm_stat_diff[i] = 0; + if (pzstats->vm_stat_diff[i]) { + int v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_stat[i]); atomic_long_add(v, &vm_zone_stat[i]); } #ifdef CONFIG_NUMA for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pset->vm_numa_stat_diff[i]) { - int v = pset->vm_numa_stat_diff[i]; + if (pzstats->vm_numa_stat_diff[i]) { + int v = pzstats->vm_numa_stat_diff[i]; - pset->vm_numa_stat_diff[i] = 0; + pzstats->vm_numa_stat_diff[i] = 0; atomic_long_add(v, &zone->vm_numa_stat[i]); atomic_long_add(v, &vm_numa_stat[i]); } @@ -965,8 +968,8 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { - struct per_cpu_pageset __percpu *pcp = zone->pageset; - u16 __percpu *p = pcp->vm_numa_stat_diff + item; + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + u16 __percpu *p = pzstats->vm_numa_stat_diff + item; u16 v; v = __this_cpu_inc_return(*p); @@ -1693,21 +1696,23 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { - struct per_cpu_pageset *pageset; + struct per_cpu_pages *pcp; + struct per_cpu_zonestat __maybe_unused *pzstats; - pageset = per_cpu_ptr(zone->pageset, i); + pcp = per_cpu_ptr(zone->per_cpu_pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" "\n high: %i" "\n batch: %i", i, - pageset->pcp.count, - pageset->pcp.high, - pageset->pcp.batch); + pcp->count, + pcp->high, + pcp->batch); #ifdef CONFIG_SMP + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); seq_printf(m, "\n vm stats threshold: %d", - pageset->stat_threshold); + pzstats->stat_threshold); #endif } seq_printf(m, @@ -1927,17 +1932,18 @@ static bool need_update(int cpu) struct zone *zone; for_each_populated_zone(zone) { - struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); struct per_cpu_nodestat *n; + /* * The fast way of checking if there are any vmstat diffs. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * - sizeof(p->vm_stat_diff[0]))) + if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * + sizeof(pzstats->vm_stat_diff[0]))) return true; #ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * - sizeof(p->vm_numa_stat_diff[0]))) + if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * + sizeof(pzstats->vm_numa_stat_diff[0]))) return true; #endif if (last_pgdat == zone->zone_pgdat) -- cgit From dbbee9d5cd83f9d0a29639e260516907ceb2ac3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:41 -0700 Subject: mm/page_alloc: convert per-cpu list protection to local_lock There is a lack of clarity of what exactly local_irq_save/local_irq_restore protects in page_alloc.c . It conflates the protection of per-cpu page allocation structures with per-cpu vmstat deltas. This patch protects the PCP structure using local_lock which for most configurations is identical to IRQ enabling/disabling. The scope of the lock is still wider than it should be but this is decreased later. 
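As a standalone sketch of the pattern (hypothetical demo_* names, not code from this patch; the real conversion is in the diff below), per-CPU data protected by a local_lock rather than a bare local_irq_save() looks roughly like this:

#include <linux/local_lock.h>
#include <linux/percpu.h>

/* Hypothetical per-CPU lock kept next to (not inside) the per-CPU data
 * it protects, mirroring the separate "struct pagesets" used below. */
struct demo_locks {
	local_lock_t lock;
};
static DEFINE_PER_CPU(struct demo_locks, demo_locks) = {
	.lock = INIT_LOCAL_LOCK(lock),
};
static DEFINE_PER_CPU(int, demo_count);

static void demo_update(void)
{
	unsigned long flags;

	/*
	 * On !PREEMPT_RT this disables IRQs much like local_irq_save();
	 * on PREEMPT_RT it maps to a per-CPU spinlock_t. Either way the
	 * lock documents which per-CPU data the critical section covers.
	 */
	local_lock_irqsave(&demo_locks.lock, flags);
	this_cpu_inc(demo_count);
	local_unlock_irqrestore(&demo_locks.lock, flags);
}
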
It is possible for the local_lock to be embedded safely within struct per_cpu_pages but it adds complexity to free_unref_page_list. [akpm@linux-foundation.org: coding style fixes] [mgorman@techsingularity.net: work around a pahole limitation with zero-sized struct pagesets] Link: https://lkml.kernel.org/r/20210526080741.GW30378@techsingularity.net [lkp@intel.com: Make pagesets static] Link: https://lkml.kernel.org/r/20210512095458.30632-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ lib/Kconfig.debug | 3 +++ mm/page_alloc.c | 61 +++++++++++++++++++++++++++++++++++++------------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a50b123ab7ae..0d6bb737e5a2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -20,6 +20,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -337,6 +338,7 @@ enum zone_watermarks { #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) +/* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7723f58a9394..deca67d28abb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -313,6 +313,9 @@ config DEBUG_INFO_BTF config PAHOLE_HAS_SPLIT_BTF def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119") +config PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT + def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "122") + config DEBUG_INFO_BTF_MODULES def_bool y depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 330c7307a92b..89872ad5e872 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -122,6 +122,24 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) +struct pagesets { + local_lock_t lock; +#if defined(CONFIG_DEBUG_INFO_BTF) && \ + !defined(CONFIG_DEBUG_LOCK_ALLOC) && \ + !defined(CONFIG_PAHOLE_HAS_ZEROSIZE_PERCPU_SUPPORT) + /* + * pahole 1.21 and earlier gets confused by zero-sized per-CPU + * variables and produces invalid BTF. Ensure that + * sizeof(struct pagesets) != 0 for older versions of pahole. + */ + char __pahole_hack; + #warning "pahole too old to support zero-sized struct pagesets" +#endif +}; +static DEFINE_PER_CPU(struct pagesets, pagesets) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1453,6 +1471,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (--count && --batch_free && !list_empty(list)); } + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. 
+ */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1573,6 +1595,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, return; migratetype = get_pfnblock_migratetype(page, pfn); + + /* + * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock + * and protect vmstat updates. + */ local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype, @@ -2955,6 +2982,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; + /* + * local_lock_irq held so equivalent to spin_lock_irqsave for + * both PREEMPT_RT and non-PREEMPT_RT configurations. + */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3007,12 +3038,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) unsigned long flags; int to_drain, batch; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } #endif @@ -3028,13 +3059,13 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) unsigned long flags; struct per_cpu_pages *pcp; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3297,9 +3328,9 @@ void free_unref_page(struct page *page) if (!free_unref_page_prepare(page, pfn)) return; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); free_unref_page_commit(page, pfn); - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3319,7 +3350,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, pfn); } - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_private(page); @@ -3332,12 +3363,12 @@ void free_unref_page_list(struct list_head *list) * a large list of pages to free. 
*/ if (++batch_count == SWAP_CLUSTER_MAX) { - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); } } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3494,7 +3525,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct page *page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); @@ -3502,7 +3533,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -5103,7 +5134,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed; /* Attempt the batch allocation */ - local_irq_save(flags); + local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_list = &pcp->lists[ac.migratetype]; @@ -5141,12 +5172,12 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; failed_irq: - local_irq_restore(flags); + local_unlock_irqrestore(&pagesets.lock, flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); -- cgit From f19298b9516c1a031b34b4147773457e3efe743b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:44 -0700 Subject: mm/vmstat: convert NUMA statistics to basic NUMA counters NUMA statistics are maintained on the zone level for hits, misses, foreign etc but nothing relies on them being perfectly accurate for functional correctness. The counters are used by userspace to get a general overview of a workloads NUMA behaviour but the page allocator incurs a high cost to maintain perfect accuracy similar to what is required for a vmstat like NR_FREE_PAGES. There even is a sysctl vm.numa_stat to allow userspace to turn off the collection of NUMA statistics like NUMA_HIT. This patch converts NUMA_HIT and friends to be NUMA events with similar accuracy to VM events. There is a possibility that slight errors will be introduced but the overall trend as seen by userspace will be similar. The counters are no longer updated from vmstat_refresh context as it is unnecessary overhead for counters that may never be read by userspace. Note that counters could be maintained at the node level to save space but it would have a user-visible impact due to /proc/zoneinfo. 
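As a rough sketch of the new scheme with made-up demo_* names (the real conversion follows in the diff): the hot path does a cheap, possibly racy per-CPU increment, and the per-CPU deltas are folded into the global total only when the counter is actually read:

#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_numa_event);
static atomic_long_t demo_numa_event_total = ATOMIC_LONG_INIT(0);

static inline void demo_count_event(void)
{
	/* Tolerates rare lost updates; accuracy only needs to be "good enough". */
	raw_cpu_inc(demo_numa_event);
}

/* Read side, e.g. a /proc or sysfs show function: fold on demand. */
static unsigned long demo_read_event(void)
{
	int cpu;

	for_each_online_cpu(cpu)
		atomic_long_add(xchg(per_cpu_ptr(&demo_numa_event, cpu), 0),
				&demo_numa_event_total);

	return atomic_long_read(&demo_numa_event_total);
}
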
[lkp@intel.com: Fix misplaced closing brace for !CONFIG_NUMA] Link: https://lkml.kernel.org/r/20210512095458.30632-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 18 ++--- include/linux/mmzone.h | 13 ++-- include/linux/vmstat.h | 43 ++++++------ mm/mempolicy.c | 2 +- mm/page_alloc.c | 12 ++-- mm/vmstat.c | 173 +++++++++++++++++++------------------------------ 6 files changed, 113 insertions(+), 148 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 2c36f61d30bc..9db297431b97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -482,6 +482,7 @@ static DEVICE_ATTR(meminfo, 0444, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + fold_vm_numa_events(); return sysfs_emit(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -489,12 +490,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + sum_zone_numa_event_state(dev->id, NUMA_HIT), + sum_zone_numa_event_state(dev->id, NUMA_MISS), + sum_zone_numa_event_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_event_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_event_state(dev->id, NUMA_LOCAL), + sum_zone_numa_event_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, 0444, node_read_numastat, NULL); @@ -512,10 +513,11 @@ static ssize_t node_read_vmstat(struct device *dev, sum_zone_node_page_state(nid, i)); #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + fold_vm_numa_events(); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) len += sysfs_emit_at(buf, len, "%s %lu\n", numa_stat_name(i), - sum_zone_numa_state(nid, i)); + sum_zone_numa_event_state(nid, i)); #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0d6bb737e5a2..f86018d5e362 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -135,10 +135,10 @@ enum numa_stat_item { NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ - NR_VM_NUMA_STAT_ITEMS + NR_VM_NUMA_EVENT_ITEMS }; #else -#define NR_VM_NUMA_STAT_ITEMS 0 +#define NR_VM_NUMA_EVENT_ITEMS 0 #endif enum zone_stat_item { @@ -357,7 +357,12 @@ struct per_cpu_zonestat { s8 stat_threshold; #endif #ifdef CONFIG_NUMA - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + /* + * Low priority inaccurate counters that are only folded + * on demand. Use a large type to avoid the overhead of + * folding during refresh_cpu_vm_stats. 
+ */ + unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #endif }; @@ -623,7 +628,7 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; + atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 0c5f36504613..59748bbbba4c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -138,34 +138,27 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; +extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; #ifdef CONFIG_NUMA -static inline void zone_numa_state_add(long x, struct zone *zone, - enum numa_stat_item item) +static inline void zone_numa_event_add(long x, struct zone *zone, + enum numa_stat_item item) { - atomic_long_add(x, &zone->vm_numa_stat[item]); - atomic_long_add(x, &vm_numa_stat[item]); + atomic_long_add(x, &zone->vm_numa_event[item]); + atomic_long_add(x, &vm_numa_event[item]); } -static inline unsigned long global_numa_state(enum numa_stat_item item) +static inline unsigned long zone_numa_event_state(struct zone *zone, + enum numa_stat_item item) { - long x = atomic_long_read(&vm_numa_stat[item]); - - return x; + return atomic_long_read(&zone->vm_numa_event[item]); } -static inline unsigned long zone_numa_state_snapshot(struct zone *zone, - enum numa_stat_item item) +static inline unsigned long +global_numa_event_state(enum numa_stat_item item) { - long x = atomic_long_read(&zone->vm_numa_stat[item]); - int cpu; - - for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; - - return x; + return atomic_long_read(&vm_numa_event[item]); } #endif /* CONFIG_NUMA */ @@ -245,18 +238,22 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } #ifdef CONFIG_NUMA -extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); +extern void __count_numa_event(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); -extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); +extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); +extern void fold_vm_numa_events(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) +static inline void fold_vm_numa_events(void) +{ +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP @@ -428,7 +425,7 @@ static inline const char *numa_stat_name(enum numa_stat_item item) static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + item]; } @@ -440,7 +437,7 @@ static inline const char *lru_list_name(enum lru_list lru) static inline const char *writeback_stat_name(enum writeback_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + 
- NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + item]; } @@ -449,7 +446,7 @@ static inline const char *writeback_stat_name(enum writeback_stat_item item) static inline const char *vm_event_name(enum vm_event_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + - NR_VM_NUMA_STAT_ITEMS + + NR_VM_NUMA_EVENT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS + item]; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 325771bef5e2..b5d95bf1025d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2150,7 +2150,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, return page; if (page && page_to_nid(page) == nid) { preempt_disable(); - __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89872ad5e872..4e03109bdae5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3480,12 +3480,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __inc_numa_state(z, NUMA_HIT); + __count_numa_event(z, NUMA_HIT); else { - __inc_numa_state(z, NUMA_MISS); - __inc_numa_state(preferred_zone, NUMA_FOREIGN); + __count_numa_event(z, NUMA_MISS); + __count_numa_event(preferred_zone, NUMA_FOREIGN); } - __inc_numa_state(z, local_stat); + __count_numa_event(z, local_stat); #endif } @@ -6785,8 +6785,8 @@ void __init setup_per_cpu_pageset(void) */ for_each_possible_cpu(cpu) { struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); - memset(pzstats->vm_numa_stat_diff, 0, - sizeof(pzstats->vm_numa_stat_diff)); + memset(pzstats->vm_numa_event, 0, + sizeof(pzstats->vm_numa_event)); } #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index f1400ba46beb..0e27b62e487d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -31,8 +31,6 @@ #include "internal.h" -#define NUMA_STATS_THRESHOLD (U16_MAX - 2) - #ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; @@ -41,11 +39,12 @@ static void zero_zone_numa_counters(struct zone *zone) { int item, cpu; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { - atomic_long_set(&zone->vm_numa_stat[item], 0); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item] + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_event[item], 0); + for_each_online_cpu(cpu) { + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] = 0; + } } } @@ -63,8 +62,8 @@ static void zero_global_numa_counters(void) { int item; - for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) - atomic_long_set(&vm_numa_stat[item], 0); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + atomic_long_set(&vm_numa_event[item], 0); } static void invalid_numa_statistics(void) @@ -161,10 +160,9 @@ void vm_events_fold_cpu(int cpu) * vm_stat contains the global counters */ atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; -atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp; atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_zone_stat); -EXPORT_SYMBOL(vm_numa_stat); EXPORT_SYMBOL(vm_node_stat); #ifdef CONFIG_SMP @@ -706,8 +704,7 @@ EXPORT_SYMBOL(dec_node_page_state); * Fold a differential into the global counters. 
* Returns the number of counters updated. */ -#ifdef CONFIG_NUMA -static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) +static int fold_diff(int *zone_diff, int *node_diff) { int i; int changes = 0; @@ -718,12 +715,6 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) changes++; } - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (numa_diff[i]) { - atomic_long_add(numa_diff[i], &vm_numa_stat[i]); - changes++; - } - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) if (node_diff[i]) { atomic_long_add(node_diff[i], &vm_node_stat[i]); @@ -731,26 +722,34 @@ static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff) } return changes; } -#else -static int fold_diff(int *zone_diff, int *node_diff) + +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) { - int i; - int changes = 0; + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (zone_diff[i]) { - atomic_long_add(zone_diff[i], &vm_zone_stat[i]); - changes++; - } + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - if (node_diff[i]) { - atomic_long_add(node_diff[i], &vm_node_stat[i]); - changes++; + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); } - return changes; + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); } -#endif /* CONFIG_NUMA */ + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif /* * Update the zone counters for the current cpu. 
@@ -774,9 +773,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; int changes = 0; @@ -801,17 +797,6 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - int v; - - v = this_cpu_xchg(pzstats->vm_numa_stat_diff[i], 0); - if (v) { - - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; - __this_cpu_write(pcp->expire, 3); - } - } if (do_pagesets) { cond_resched(); @@ -859,12 +844,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) } } -#ifdef CONFIG_NUMA - changes += fold_diff(global_zone_diff, global_numa_diff, - global_node_diff); -#else changes += fold_diff(global_zone_diff, global_node_diff); -#endif return changes; } @@ -879,9 +859,6 @@ void cpu_vm_stats_fold(int cpu) struct zone *zone; int i; int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; -#ifdef CONFIG_NUMA - int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, }; -#endif int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { @@ -889,7 +866,7 @@ void cpu_vm_stats_fold(int cpu) pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (pzstats->vm_stat_diff[i]) { int v; @@ -898,17 +875,17 @@ void cpu_vm_stats_fold(int cpu) atomic_long_add(v, &zone->vm_stat[i]); global_zone_diff[i] += v; } - + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pzstats->vm_numa_stat_diff[i]) { - int v; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + unsigned long v; - v = pzstats->vm_numa_stat_diff[i]; - pzstats->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - global_numa_diff[i] += v; + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } @@ -928,11 +905,7 @@ void cpu_vm_stats_fold(int cpu) } } -#ifdef CONFIG_NUMA - fold_diff(global_zone_diff, global_numa_diff, global_node_diff); -#else fold_diff(global_zone_diff, global_node_diff); -#endif } /* @@ -941,43 +914,37 @@ void cpu_vm_stats_fold(int cpu) */ void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) { + unsigned long v; int i; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { if (pzstats->vm_stat_diff[i]) { - int v = pzstats->vm_stat_diff[i]; + v = pzstats->vm_stat_diff[i]; pzstats->vm_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_stat[i]); - atomic_long_add(v, &vm_zone_stat[i]); + zone_page_state_add(v, zone, i); } + } #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - if (pzstats->vm_numa_stat_diff[i]) { - int v = pzstats->vm_numa_stat_diff[i]; - - pzstats->vm_numa_stat_diff[i] = 0; - atomic_long_add(v, &zone->vm_numa_stat[i]); - atomic_long_add(v, &vm_numa_stat[i]); + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); } + } #endif } #endif #ifdef CONFIG_NUMA -void __inc_numa_state(struct zone *zone, +/* See __count_vm_event comment on why raw_cpu_inc is used. 
*/ +void __count_numa_event(struct zone *zone, enum numa_stat_item item) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; - u16 __percpu *p = pzstats->vm_numa_stat_diff + item; - u16 v; - - v = __this_cpu_inc_return(*p); - if (unlikely(v > NUMA_STATS_THRESHOLD)) { - zone_numa_state_add(v, zone, item); - __this_cpu_write(*p, 0); - } + raw_cpu_inc(pzstats->vm_numa_event[item]); } /* @@ -998,19 +965,16 @@ unsigned long sum_zone_node_page_state(int node, return count; } -/* - * Determine the per node value of a numa stat item. To avoid deviation, - * the per cpu stat number in vm_numa_stat_diff[] is also included. - */ -unsigned long sum_zone_numa_state(int node, +/* Determine the per node value of a numa stat item. */ +unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; - int i; unsigned long count = 0; + int i; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state_snapshot(zones + i, item); + count += zone_numa_event_state(zones + i, item); return count; } @@ -1689,9 +1653,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", numa_stat_name(i), - zone_numa_state_snapshot(zone, i)); + zone_numa_event_state(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1745,7 +1709,7 @@ static const struct seq_operations zoneinfo_op = { }; #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ - NR_VM_NUMA_STAT_ITEMS + \ + NR_VM_NUMA_EVENT_ITEMS + \ NR_VM_NODE_STAT_ITEMS + \ NR_VM_WRITEBACK_STAT_ITEMS + \ (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ @@ -1760,6 +1724,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) return NULL; BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + fold_vm_numa_events(); v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); m->private = v; if (!v) @@ -1769,9 +1734,9 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_ZONE_STAT_ITEMS; #ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - v[i] = global_numa_state(i); - v += NR_VM_NUMA_STAT_ITEMS; + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + v[i] = global_numa_event_state(i); + v += NR_VM_NUMA_EVENT_ITEMS; #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { @@ -1941,11 +1906,7 @@ static bool need_update(int cpu) if (memchr_inv(pzstats->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * sizeof(pzstats->vm_stat_diff[0]))) return true; -#ifdef CONFIG_NUMA - if (memchr_inv(pzstats->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * - sizeof(pzstats->vm_numa_stat_diff[0]))) - return true; -#endif + if (last_pgdat == zone->zone_pgdat) continue; last_pgdat = zone->zone_pgdat; -- cgit From 3ac44a346a50988131db124a7e4bb99d3ec71706 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:47 -0700 Subject: mm/vmstat: inline NUMA event counter updates __count_numa_event is small enough to be treated similarly to __count_vm_event so inline it. 
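For illustration only, a hypothetical CONFIG_NUMA call site; with __count_numa_event() now a static inline in the header, it reduces to a single raw per-CPU increment with no function call:

#include <linux/mmzone.h>
#include <linux/vmstat.h>

static void demo_record_interleave_hit(struct zone *z)
{
	/* Same call mm/mempolicy.c makes; now inlined at the call site. */
	__count_numa_event(z, NUMA_INTERLEAVE_HIT);
}
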
Link: https://lkml.kernel.org/r/20210512095458.30632-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 +++++++++- mm/vmstat.c | 9 --------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 59748bbbba4c..fe32a2210e73 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -238,7 +238,15 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } #ifdef CONFIG_NUMA -extern void __count_numa_event(struct zone *zone, enum numa_stat_item item); +/* See __count_vm_event comment on why raw_cpu_inc is used. */ +static inline void +__count_numa_event(struct zone *zone, enum numa_stat_item item) +{ + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + + raw_cpu_inc(pzstats->vm_numa_event[item]); +} + extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); diff --git a/mm/vmstat.c b/mm/vmstat.c index 0e27b62e487d..b0534e068166 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -938,15 +938,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) #endif #ifdef CONFIG_NUMA -/* See __count_vm_event comment on why raw_cpu_inc is used. */ -void __count_numa_event(struct zone *zone, - enum numa_stat_item item) -{ - struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; - - raw_cpu_inc(pzstats->vm_numa_event[item]); -} - /* * Determine the per node value of a stat item. This function * is called frequently in a NUMA machine, so try to be as -- cgit From 3e23060b2d0b7eebf37b3b6043ea68da0ebc0646 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:50 -0700 Subject: mm/page_alloc: batch the accounting updates in the bulk allocator Now that the zone_statistics are simple counters that do not require special protection, the bulk allocator accounting updates can be batch updated without adding too much complexity with protected RMW updates or using xchg. 
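A rough sketch of the batching idea (the demo_* names and the stub allocator are hypothetical; the real change is in the diff below): count successful allocations inside the loop and issue one statistics update for the whole batch afterwards, instead of one update per page:

#include <linux/mmzone.h>
#include <linux/vmstat.h>

/* Stand-in for the real per-CPU list allocation. */
static bool demo_alloc_one(struct zone *zone)
{
	return true;
}

static int demo_bulk_alloc(struct zone *zone, int nr_pages)
{
	int i, nr_account = 0;

	for (i = 0; i < nr_pages; i++) {
		if (!demo_alloc_one(zone))
			break;
		nr_account++;
	}

	/* One batched update instead of nr_account individual ones. */
	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
	return nr_account;
}
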
Link: https://lkml.kernel.org/r/20210512095458.30632-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 8 ++++++++ mm/page_alloc.c | 30 +++++++++++++----------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fe32a2210e73..d6a6cf53b127 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -247,6 +247,14 @@ __count_numa_event(struct zone *zone, enum numa_stat_item item) raw_cpu_inc(pzstats->vm_numa_event[item]); } +static inline void +__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta) +{ + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; + + raw_cpu_add(pzstats->vm_numa_event[item], delta); +} + extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e03109bdae5..6bb9b87cf7d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3467,7 +3467,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + long nr_account) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -3480,12 +3481,12 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) local_stat = NUMA_OTHER; if (zone_to_nid(z) == zone_to_nid(preferred_zone)) - __count_numa_event(z, NUMA_HIT); + __count_numa_events(z, NUMA_HIT, nr_account); else { - __count_numa_event(z, NUMA_MISS); - __count_numa_event(preferred_zone, NUMA_FOREIGN); + __count_numa_events(z, NUMA_MISS, nr_account); + __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); } - __count_numa_event(z, local_stat); + __count_numa_events(z, local_stat, nr_account); #endif } @@ -3531,7 +3532,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); } local_unlock_irqrestore(&pagesets.lock, flags); return page; @@ -3592,7 +3593,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, 1); local_irq_restore(flags); out: @@ -5077,7 +5078,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct alloc_context ac; gfp_t alloc_gfp; unsigned int alloc_flags = ALLOC_WMARK_LOW; - int nr_populated = 0; + int nr_populated = 0, nr_account = 0; if (unlikely(nr_pages <= 0)) return 0; @@ -5154,15 +5155,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed_irq; break; } - - /* - * Ideally this would be batched but the best way to do - * that cheaply is to first convert zone_statistics to - * be inaccurate per-cpu counter like vm_events to avoid - * a RMW cycle then do the accounting with IRQs enabled. 
- */ - __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); - zone_statistics(ac.preferred_zoneref->zone, zone); + nr_account++; prep_new_page(page, 0, gfp, 0); if (page_list) @@ -5172,6 +5165,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); + zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + local_unlock_irqrestore(&pagesets.lock, flags); return nr_populated; -- cgit From 43c95bcc51e4e7f3e3cbce01515fe429a4cf12a7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:54 -0700 Subject: mm/page_alloc: reduce duration that IRQs are disabled for VM counters IRQs are left disabled for the zone and node VM event counters. This is unnecessary as the affected counters are allowed to race for preemmption and IRQs. This patch reduces the scope of IRQs being disabled via local_[lock|unlock]_irq on !PREEMPT_RT kernels. One __mod_zone_freepage_state is still called with IRQs disabled. While this could be moved out, it's not free on all architectures as some require IRQs to be disabled for mod_zone_page_state on !PREEMPT_RT kernels. Link: https://lkml.kernel.org/r/20210512095458.30632-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6bb9b87cf7d5..161bcda61520 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3530,11 +3530,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp = this_cpu_ptr(zone->per_cpu_pageset); list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); } - local_unlock_irqrestore(&pagesets.lock, flags); return page; } @@ -3586,15 +3586,15 @@ struct page *rmqueue(struct zone *preferred_zone, if (!page) page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); - local_irq_restore(flags); out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3607,7 +3607,7 @@ out: return page; failed: - local_irq_restore(flags); + spin_unlock_irqrestore(&zone->lock, flags); return NULL; } @@ -5165,11 +5165,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + local_unlock_irqrestore(&pagesets.lock, flags); + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); - local_unlock_irqrestore(&pagesets.lock, flags); - return nr_populated; failed_irq: -- cgit From 56f0e661ea8c0178e80048df7166653a51ef2c3d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:41:57 -0700 Subject: mm/page_alloc: explicitly acquire the zone lock in __free_pages_ok __free_pages_ok() disables IRQs before calling a common helper free_one_page() that 
acquires the zone lock. This is not safe according to Documentation/locking/locktypes.rst and in this context, IRQ disabling is not protecting a per_cpu_pages structure either or a local_lock would be used. This patch explicitly acquires the lock with spin_lock_irqsave instead of relying on a helper. This removes the last instance of local_irq_save() in page_alloc.c. Link: https://lkml.kernel.org/r/20210512095458.30632-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 161bcda61520..f1a51c163e75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1590,21 +1590,21 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); - /* - * TODO FIX: Disable IRQs before acquiring IRQ-safe zone->lock - * and protect vmstat updates. - */ - local_irq_save(flags); + spin_lock_irqsave(&zone->lock, flags); __count_vm_events(PGFREE, 1 << order); - free_one_page(page_zone(page), page, pfn, order, migratetype, - fpi_flags); - local_irq_restore(flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); } void __free_pages_core(struct page *page, unsigned int order) -- cgit From df1acc856923c0a65c28b588585449106c316b71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:00 -0700 Subject: mm/page_alloc: avoid conflating IRQs disabled with zone->lock Historically when freeing pages, free_one_page() assumed that callers had IRQs disabled and the zone->lock could be acquired with spin_lock(). This confuses the scope of what local_lock_irq is protecting and what zone->lock is protecting in free_unref_page_list in particular. This patch uses spin_lock_irqsave() for the zone->lock in free_one_page() instead of relying on callers to have disabled IRQs. free_unref_page_commit() is changed to only deal with PCP pages protected by the local lock. free_unref_page_list() then first frees isolated pages to the buddy lists with free_one_page() and frees the rest of the pages to the PCP via free_unref_page_commit(). The end result is that free_one_page() is no longer depending on side-effects of local_lock to be correct. Note that this may incur a performance penalty while memory hot-remove is running but that is not a common operation. 
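A minimal sketch of the shape of the change (the demo_* name is made up; the real code is in the diff below): the freeing helper now acquires the IRQ-safe zone lock itself instead of assuming its caller already disabled interrupts:

#include <linux/mmzone.h>
#include <linux/spinlock.h>

static void demo_free_one_page(struct zone *zone)
{
	unsigned long flags;

	/* Previously: caller did local_irq_save(), helper used spin_lock(). */
	spin_lock_irqsave(&zone->lock, flags);
	/* ... merge the page back into the buddy free lists ... */
	spin_unlock_irqrestore(&zone->lock, flags);
}
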
[lkp@intel.com: Ensure CMA pages get addded to correct pcp list] Link: https://lkml.kernel.org/r/20210512095458.30632-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 75 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1a51c163e75..dd367e5df8cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,13 +1501,15 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype, fpi_t fpi_flags) { - spin_lock(&zone->lock); + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -3285,31 +3287,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_unref_page_commit(struct page *page, unsigned long pfn) +static void free_unref_page_commit(struct page *page, unsigned long pfn, + int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - int migratetype; - migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); - - /* - * We only track unmovable, reclaimable and movable on pcp lists. - * Free ISOLATE pages back to the allocator because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free - * excessively into the page allocator - */ - if (migratetype >= MIGRATE_PCPTYPES) { - if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, 0, migratetype, - FPI_NONE); - return; - } - migratetype = MIGRATE_MOVABLE; - } - pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; @@ -3324,12 +3308,29 @@ void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); + int migratetype; if (!free_unref_page_prepare(page, pfn)) return; + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Place ISOLATE pages on the isolated list because they are being + * offlined but treat HIGHATOMIC as movable pages so we can get those + * areas back if necessary. 
Otherwise, we may have to free + * excessively into the page allocator + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + return; + } + migratetype = MIGRATE_MOVABLE; + } + local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3341,22 +3342,44 @@ void free_unref_page_list(struct list_head *list) struct page *page, *next; unsigned long flags, pfn; int batch_count = 0; + int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn)) list_del(&page->lru); + + /* + * Free isolated pages directly to the allocator, see + * comment in free_unref_page. + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, + migratetype, FPI_NONE); + continue; + } + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. + */ + set_pcppage_migratetype(page, MIGRATE_MOVABLE); + } + set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_private(page); - + pfn = page_private(page); set_page_private(page, 0); + migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, migratetype); /* * Guard against excessive IRQ disabled times when we get -- cgit From 902499937e3a82156dcb5069b6df27640480e204 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:03 -0700 Subject: mm/page_alloc: update PGFREE outside the zone lock in __free_pages_ok VM events do not need explicit protection by disabling IRQs so update the counter with IRQs enabled in __free_pages_ok. 
Link: https://lkml.kernel.org/r/20210512095458.30632-10-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Peter Zijlstra (Intel) Cc: Chuck Lever Cc: Ingo Molnar Cc: Jesper Dangaard Brouer Cc: Michal Hocko Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd367e5df8cb..37ce0c2f3bae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1600,13 +1600,14 @@ static void __free_pages_ok(struct page *page, unsigned int order, migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); - __count_vm_events(PGFREE, 1 << order); if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); } void __free_pages_core(struct page *page, unsigned int order) -- cgit From 151e084af4946344fe0d021f4110b69edaac1e8d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 28 Jun 2021 19:42:06 -0700 Subject: mm: page_alloc: dump migrate-failed pages only at -EBUSY alloc_contig_dump_pages() aims for helping debugging page migration failure by elevated page refcount compared to expected_count. (for the detail, please look at migrate_page_move_mapping) However, -ENOMEM is just the case that system is under memory pressure state, not relevant with page refcount at all. Thus, the dumping page list is not helpful for the debugging point of view. Link: https://lkml.kernel.org/r/YKa2Wyo9xqIErpfa@google.com Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Suren Baghdasaryan Cc: John Dias Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ce0c2f3bae..941a75b9fb5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8800,7 +8800,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, lru_cache_enable(); if (ret < 0) { - alloc_contig_dump_pages(&cc->migratepages); + if (ret == -EBUSY) + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } -- cgit From bbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:09 -0700 Subject: mm/page_alloc: delete vm.percpu_pagelist_fraction Patch series "Calculate pcp->high based on zone sizes and active CPUs", v2. The per-cpu page allocator (PCP) is meant to reduce contention on the zone lock but the sizing of batch and high is archaic and neither takes the zone size into account or the number of CPUs local to a zone. With larger zones and more CPUs per node, the contention is getting worse. Furthermore, the fact that vm.percpu_pagelist_fraction adjusts both batch and high values means that the sysctl can reduce zone lock contention but also increase allocation latencies. This series disassociates pcp->high from pcp->batch and then scales pcp->high based on the size of the local zone with limited impact to reclaim and accounting for active CPUs but leaves pcp->batch static. It also adapts the number of pages that can be on the pcp list based on recent freeing patterns. 
The motivation is partially to adjust to larger memory sizes but is also driven by the fact that large batches of page freeing via release_pages() often shows zone contention as a major part of the problem. Another is a bug report based on an older kernel where a multi-terabyte process can takes several minutes to exit. A workaround was to use vm.percpu_pagelist_fraction to increase the pcp->high value but testing indicated that a production workload could not use the same values because of an increase in allocation latencies. Unfortunately, I cannot reproduce this test case myself as the multi-terabyte machines are in active use but it should alleviate the problem. The series aims to address both and partially acts as a pre-requisite. pcp only works with order-0 which is useless for SLUB (when using high orders) and THP (unconditionally). To store high-order pages on PCP, the pcp->high values need to be increased first. This patch (of 6): The vm.percpu_pagelist_fraction is used to increase the batch and high limits for the per-cpu page allocator (PCP). The intent behind the sysctl is to reduce zone lock acquisition when allocating/freeing pages but it has a problem. While it can decrease contention, it can also increase latency on the allocation side due to unreasonably large batch sizes. This leads to games where an administrator adjusts percpu_pagelist_fraction on the fly to work around contention and allocation latency problems. This series aims to alleviate the problems with zone lock contention while avoiding the allocation-side latency problems. For the purposes of review, it's easier to remove this sysctl now and reintroduce a similar sysctl later in the series that deals only with pcp->high. Link: https://lkml.kernel.org/r/20210525080119.5455-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 19 ------------ include/linux/mmzone.h | 3 -- kernel/sysctl.c | 8 ----- mm/page_alloc.c | 55 +++------------------------------ 4 files changed, 4 insertions(+), 81 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 586cd4b86428..2fcafccb53a8 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -64,7 +64,6 @@ Currently, these files are in /proc/sys/vm: - overcommit_ratio - page-cluster - panic_on_oom -- percpu_pagelist_fraction - stat_interval - stat_refresh - numa_stat @@ -790,24 +789,6 @@ panic_on_oom=2+kdump gives you very strong tool to investigate why oom happens. You can get snapshot. -percpu_pagelist_fraction -======================== - -This is the fraction of pages at most (high mark pcp->high) in each zone that -are allocated for each per cpu page list. The min value for this is 8. It -means that we don't allow more than 1/8th of pages in each zone to be -allocated in any single per_cpu_pagelist. This entry only changes the value -of hot per cpu pagelists. User can specify a number like 100 to allocate -1/100th of each zone to each per cpu page list. - -The batch value of each per cpu pagelist is also updated as a result. It is -set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) - -The initial value is zero. 
Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. If the user writes '0' to this -sysctl, it will revert to this default behavior. - - stat_interval ============= diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f86018d5e362..7937a1d1d166 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1027,15 +1027,12 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); -extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d4a78e08f6d8..51213c33171e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2908,14 +2908,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, - { - .procname = "percpu_pagelist_fraction", - .data = &percpu_pagelist_fraction, - .maxlen = sizeof(percpu_pagelist_fraction), - .mode = 0644, - .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = SYSCTL_ZERO, - }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941a75b9fb5a..5abf2c1d4c58 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,7 +120,6 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); -#define MIN_PERCPU_PAGELIST_FRACTION (8) struct pagesets { local_lock_t lock; @@ -193,7 +192,6 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6735,22 +6733,15 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h /* * Calculate and set new high and batch values for all per-cpu pagesets of a - * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. + * zone based on the zone's size. */ static void zone_set_pageset_high_and_batch(struct zone *zone) { unsigned long new_high, new_batch; - if (percpu_pagelist_fraction) { - new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; - new_batch = max(1UL, new_high / 4); - if ((new_high / 4) > (PAGE_SHIFT * 8)) - new_batch = PAGE_SHIFT * 8; - } else { - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); - } + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8413,44 +8404,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -/* - * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. 
It is the fraction of total pages in each zone that a hot per cpu - * pagelist can have before it gets flushed back to buddy allocator. - */ -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos) -{ - struct zone *zone; - int old_percpu_pagelist_fraction; - int ret; - - mutex_lock(&pcp_batch_high_lock); - old_percpu_pagelist_fraction = percpu_pagelist_fraction; - - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || ret < 0) - goto out; - - /* Sanity checking to avoid pcp imbalance */ - if (percpu_pagelist_fraction && - percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { - percpu_pagelist_fraction = old_percpu_pagelist_fraction; - ret = -EINVAL; - goto out; - } - - /* No change? */ - if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) - goto out; - - for_each_populated_zone(zone) - zone_set_pageset_high_and_batch(zone); -out: - mutex_unlock(&pcp_batch_high_lock); - return ret; -} - #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit From b92ca18e8ca596f4f3d80c1fe833bc57a1b2458c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:12 -0700 Subject: mm/page_alloc: disassociate the pcp->high from pcp->batch The pcp high watermark is based on the batch size but there is no relationship between them other than it is convenient to use early in boot. This patch takes the first step and bases pcp->high on the zone low watermark split across the number of CPUs local to a zone while the batch size remains the same to avoid increasing allocation latencies. The intent behind the default pcp->high is "set the number of PCP pages such that if they are all full that background reclaim is not started prematurely". Note that in this patch the pcp->high values are adjusted after memory hotplug events, min_free_kbytes adjustments and watermark scale factor adjustments but not CPU hotplug events which is handled later in the series. On a test KVM instance; Before grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 378 batch: 63 After grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 [mgorman@techsingularity.net: fix __setup_per_zone_wmarks for parallel memory hotplug] Link: https://lkml.kernel.org/r/20210528105925.GN30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- mm/page_alloc.c | 62 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70620d0dd923..974a565797d8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -961,7 +961,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); - zone_pcp_update(zone); /* Basic onlining is complete, allow allocation of onlined pages. 
*/ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); @@ -974,6 +973,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z */ shuffle_zone(zone); + /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); kswapd_run(nid); @@ -1829,13 +1829,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); adjust_present_page_count(zone, -nr_pages); + /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); - } else - zone_pcp_update(zone); + } node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5abf2c1d4c58..19ec81d403a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2174,14 +2174,6 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); - /* - * The number of managed pages has changed due to the initialisation - * so the pcpu batch and high limits needs to be updated or the limits - * will be artificially small. - */ - for_each_populated_zone(zone) - zone_pcp_update(zone); - /* * We initialized the rest of the deferred pages. Permanently disable * on-demand struct page initialization. @@ -6633,13 +6625,12 @@ static int zone_batchsize(struct zone *zone) int batch; /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. + * The number of pages to batch allocate is either ~0.1% + * of the zone or 1MB, whichever is smaller. The batch + * size is striking a balance between allocation latency + * and zone lock contention. */ - batch = zone_managed_pages(zone) / 1024; - /* But no more than a meg. */ - if (batch * PAGE_SIZE > 1024 * 1024) - batch = (1024 * 1024) / PAGE_SIZE; + batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -6676,6 +6667,34 @@ static int zone_batchsize(struct zone *zone) #endif } +static int zone_highsize(struct zone *zone, int batch) +{ +#ifdef CONFIG_MMU + int high; + int nr_local_cpus; + + /* + * The high value of the pcp is based on the zone low watermark + * so that if they are full then background reclaim will not be + * started prematurely. The value is split across all online CPUs + * local to the zone. Note that early in boot that CPUs may not be + * online yet. + */ + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + high = low_wmark_pages(zone) / nr_local_cpus; + + /* + * Ensure high is at least batch*4. The multiple is based on the + * historical relationship between high and batch. + */ + high = max(high, batch << 2); + + return high; +#else + return 0; +#endif +} + /* * pcp->high and pcp->batch values are related and generally batch is lower * than high. 
They are also related to pcp->count such that count is lower @@ -6737,11 +6756,10 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h */ static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long new_high, new_batch; + int new_high, new_batch; - new_batch = zone_batchsize(zone); - new_high = 6 * new_batch; - new_batch = max(1UL, 1 * new_batch); + new_batch = max(1, zone_batchsize(zone)); + new_high = zone_highsize(zone, new_batch); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -8222,11 +8240,19 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { + struct zone *zone; static DEFINE_SPINLOCK(lock); spin_lock(&lock); __setup_per_zone_wmarks(); spin_unlock(&lock); + + /* + * The watermark size have changed so update the pcpu batch + * and high limits or the limits may be inappropriate. + */ + for_each_zone(zone) + zone_pcp_update(zone); } /* -- cgit From 04f8cfeaed0849e702278378bce3867577ca45fb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:15 -0700 Subject: mm/page_alloc: adjust pcp->high after CPU hotplug events The PCP high watermark is based on the number of online CPUs so the watermarks must be adjusted during CPU hotplug. At the time of hot-remove, the number of online CPUs is already adjusted but during hot-add, a delta needs to be applied to update PCP to the correct value. After this patch is applied, the high watermarks are adjusted correctly. # grep high: /proc/zoneinfo | tail -1 high: 649 # echo 0 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 664 # echo 1 > /sys/devices/system/cpu/cpu4/online # grep high: /proc/zoneinfo | tail -1 high: 649 Link: https://lkml.kernel.org/r/20210525080119.5455-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuhotplug.h | 2 +- mm/internal.h | 2 +- mm/page_alloc.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4a62b3980642..47e13582d9fc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -54,7 +54,7 @@ enum cpuhp_state { CPUHP_MM_MEMCQ_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, - CPUHP_PAGE_ALLOC_DEAD, + CPUHP_PAGE_ALLOC, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_IOVA_DEAD, diff --git a/mm/internal.h b/mm/internal.h index 2946dfa0f245..18e5fb4d225f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -206,7 +206,7 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19ec81d403a0..8d196a803820 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6667,7 +6667,7 @@ static int zone_batchsize(struct zone *zone) #endif } -static int zone_highsize(struct zone *zone, int batch) +static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; @@ -6678,9 +6678,10 @@ static int zone_highsize(struct zone *zone, int batch) * so that if they are full 
then background reclaim will not be * started prematurely. The value is split across all online CPUs * local to the zone. Note that early in boot that CPUs may not be - * online yet. + * online yet and that during CPU hotplug that the cpumask is not + * yet updated when a CPU is being onlined. */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))); + nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; high = low_wmark_pages(zone) / nr_local_cpus; /* @@ -6754,12 +6755,12 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h * Calculate and set new high and batch values for all per-cpu pagesets of a * zone based on the zone's size. */ -static void zone_set_pageset_high_and_batch(struct zone *zone) +static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { int new_high, new_batch; new_batch = max(1, zone_batchsize(zone)); - new_high = zone_highsize(zone, new_batch); + new_high = zone_highsize(zone, new_batch, cpu_online); if (zone->pageset_high == new_high && zone->pageset_batch == new_batch) @@ -6789,7 +6790,7 @@ void __meminit setup_zone_pageset(struct zone *zone) per_cpu_pages_init(pcp, pzstats); } - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, 0); } /* @@ -8044,6 +8045,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) static int page_alloc_cpu_dead(unsigned int cpu) { + struct zone *zone; lru_add_drain_cpu(cpu); drain_pages(cpu); @@ -8064,6 +8066,19 @@ static int page_alloc_cpu_dead(unsigned int cpu) * race with what we are doing. */ cpu_vm_stats_fold(cpu); + + for_each_populated_zone(zone) + zone_pcp_update(zone, 0); + + return 0; +} + +static int page_alloc_cpu_online(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zone_pcp_update(zone, 1); return 0; } @@ -8089,8 +8104,9 @@ void __init page_alloc_init(void) hashdist = 0; #endif - ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, - "mm/page_alloc:dead", NULL, + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC, + "mm/page_alloc:pcp", + page_alloc_cpu_online, page_alloc_cpu_dead); WARN_ON(ret < 0); } @@ -8252,7 +8268,7 @@ void setup_per_zone_wmarks(void) * and high limits or the limits may be inappropriate. */ for_each_zone(zone) - zone_pcp_update(zone); + zone_pcp_update(zone, 0); } /* @@ -9053,10 +9069,10 @@ EXPORT_SYMBOL(free_contig_range); * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. */ -void __meminit zone_pcp_update(struct zone *zone) +void zone_pcp_update(struct zone *zone, int cpu_online) { mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone); + zone_set_pageset_high_and_batch(zone, cpu_online); mutex_unlock(&pcp_batch_high_lock); } -- cgit From 3b12e7e97938424de2bb1b95ba0bd6a49bad39f9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:18 -0700 Subject: mm/page_alloc: scale the number of pages that are batch freed When a task is freeing a large number of order-0 pages, it may acquire the zone->lock multiple times freeing pages in batches. This may unnecessarily contend on the zone lock when freeing very large number of pages. This patch adapts the size of the batch based on the recent pattern to scale the batch size for subsequent frees. 
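The doubling behaviour can be illustrated with a small stand-alone userspace model (only a sketch of the idea described above, not the kernel code; the high/batch values of 814/63 are taken from the "With patches" trace below):

#include <stdio.h>

/*
 * Toy model of the free-batch scaling: every drain that happens without
 * an intervening allocation doubles the next batch, clamped to the range
 * [batch, high - batch].
 */
int main(void)
{
	int high = 814, batch = 63;
	int free_factor = 0;

	for (int i = 0; i < 6; i++) {
		int min_nr_free = batch;
		int max_nr_free = high - batch;
		int nr = batch << free_factor;

		if (nr < max_nr_free)
			free_factor++;
		if (nr < min_nr_free)
			nr = min_nr_free;
		if (nr > max_nr_free)
			nr = max_nr_free;

		printf("drain %d: free %d pages\n", i, nr);
	}
	return 0;	/* prints 63, 126, 252, 504, 751, 751 */
}

The same clamping is done by nr_pcp_free() in the diff below.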
As the machines I used to test this are not large enough to illustrate a problem, a debugging patch shows patterns like the following (slightly edited for clarity) Baseline vanilla kernel time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 time-unmap-14426 [...] free_pcppages_bulk: free 63 count 378 high 378 With patches time-unmap-7724 [...] free_pcppages_bulk: free 126 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 252 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 504 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 751 count 814 high 814 time-unmap-7724 [...] free_pcppages_bulk: free 751 count 814 high 814 Link: https://lkml.kernel.org/r/20210525080119.5455-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7937a1d1d166..0a86b2890a16 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -343,8 +343,9 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ + short free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA - int expire; /* When 0, remote pagesets are drained */ + short expire; /* When 0, remote pagesets are drained */ #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d196a803820..e1d1825a2611 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3278,18 +3278,47 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) +{ + int min_nr_free, max_nr_free; + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* Leave at least pcp->batch pages on the list */ + min_nr_free = batch; + max_nr_free = high - batch; + + /* + * Double the number of pages freed each time there is subsequent + * freeing of pages without any allocation. + */ + batch <<= pcp->free_factor; + if (batch < max_nr_free) + pcp->free_factor++; + batch = clamp(batch, min_nr_free, max_nr_free); + + return batch; +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; + int high; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= READ_ONCE(pcp->high)) - free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); + high = READ_ONCE(pcp->high); + if (pcp->count >= high) { + int batch = READ_ONCE(pcp->batch); + + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp); + } } /* @@ -3541,7 +3570,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; local_lock_irqsave(&pagesets.lock, flags); + + /* + * On allocation, reduce the number of pages that are batch freed.
+ * See nr_pcp_free() where free_factor is increased for subsequent + * frees. + */ pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp->free_factor >>= 1; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); @@ -6737,6 +6773,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta */ pcp->high = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; + pcp->free_factor = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, -- cgit From c49c2c47dab6b8d45022b3fabf0642a0e62e3109 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:21 -0700 Subject: mm/page_alloc: limit the number of pages on PCP lists when reclaim is active When kswapd is active then direct reclaim is potentially active. In either case, it is possible that a zone would be balanced if pages were not trapped on PCP lists. Instead of draining remote pages, simply limit the size of the PCP lists while kswapd is active. Link: https://lkml.kernel.org/r/20210525080119.5455-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 19 ++++++++++++++++++- mm/vmscan.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0a86b2890a16..b2f40d64bc4b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -647,6 +647,7 @@ enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ + ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e1d1825a2611..adf35ccfd8e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3302,6 +3302,23 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch) return batch; } +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) +{ + int high = READ_ONCE(pcp->high); + + if (unlikely(!high)) + return 0; + + if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + return high; + + /* + * If reclaim is active, limit the number of pages that can be + * stored on pcp lists + */ + return min(READ_ONCE(pcp->batch) << 2, high); +} + static void free_unref_page_commit(struct page *page, unsigned long pfn, int migratetype) { @@ -3313,7 +3330,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, pcp = this_cpu_ptr(zone->per_cpu_pageset); list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - high = READ_ONCE(pcp->high); + high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); diff --git a/mm/vmscan.c b/mm/vmscan.c index f96d62159720..d7c3cb8688dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3722,6 +3722,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, return sc->nr_scanned >= sc->nr_to_reclaim; } +/* Page allocator PCP high watermark is lowered if reclaim is active. 
*/ +static inline void +update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) +{ + int i; + struct zone *zone; + + for (i = 0; i <= highest_zoneidx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + if (active) + set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + else + clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + } +} + +static inline void +set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, true); +} + +static inline void +clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, false); +} + /* * For kswapd, balance_pgdat() will reclaim pages across a node from zones * that are eligible for use by the caller until at least one zone is @@ -3774,6 +3806,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) boosted = nr_boost_reclaim; restart: + set_reclaim_active(pgdat, highest_zoneidx); sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; @@ -3907,6 +3940,8 @@ restart: pgdat->kswapd_failures++; out: + clear_reclaim_active(pgdat, highest_zoneidx); + /* If reclaim was boosted, account for the reclaim done in this pass */ if (boosted) { unsigned long flags; -- cgit From 74f44822097c665041010994502b5971d6cd9f04 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:24 -0700 Subject: mm/page_alloc: introduce vm.percpu_pagelist_high_fraction This introduces a new sysctl vm.percpu_pagelist_high_fraction. It is similar to the old vm.percpu_pagelist_fraction. The old sysctl increased both pcp->batch and pcp->high with the higher pcp->high potentially reducing zone->lock contention. However, the higher pcp->batch value also potentially increased allocation latency while the PCP was refilled. This sysctl only adjusts pcp->high so that zone->lock contention is potentially reduced but allocation latency during a PCP refill remains the same. # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 649 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=8 # grep -E "high:|batch" /proc/zoneinfo | tail -2 high: 35071 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=64 high: 4383 batch: 63 # sysctl vm.percpu_pagelist_high_fraction=0 high: 649 batch: 63 [mgorman@techsingularity.net: fix documentation] Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Dave Hansen Acked-by: Vlastimil Babka Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 21 ++++++++++ include/linux/mmzone.h | 3 ++ kernel/sysctl.c | 8 ++++ mm/page_alloc.c | 69 +++++++++++++++++++++++++++++---- 4 files changed, 94 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 2fcafccb53a8..2da25735a629 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm: - overcommit_ratio - page-cluster - panic_on_oom +- percpu_pagelist_high_fraction - stat_interval - stat_refresh - numa_stat @@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate why oom happens. You can get snapshot. 
+percpu_pagelist_high_fraction +============================= + +This is the fraction of pages in each zone that are can be stored to +per-cpu page lists. It is an upper boundary that is divided depending +on the number of online CPUs. The min value for this is 8 which means +that we do not allow more than 1/8th of pages in each zone to be stored +on per-cpu page lists. This entry only changes the value of hot per-cpu +page lists. A user can specify a number like 100 to allocate 1/100th of +each zone between per-cpu lists. + +The batch value of each per-cpu page list remains the same regardless of +the value of the high fraction so allocation latencies are unaffected. + +The initial value is zero. Kernel uses this value to set the high pcp->high +mark based on the low watermark for the zone and the number of local +online CPUs. If the user writes '0' to this sysctl, it will revert to +this default behavior. + + stat_interval ============= diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2f40d64bc4b..7d206ca850c7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +extern int percpu_pagelist_high_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 51213c33171e..69d925f1e5da 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, + { + .procname = "percpu_pagelist_high_fraction", + .data = &percpu_pagelist_high_fraction, + .maxlen = sizeof(percpu_pagelist_high_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_high_fraction_sysctl_handler, + .extra1 = SYSCTL_ZERO, + }, { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index adf35ccfd8e5..cfc4071310fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -120,6 +120,7 @@ typedef int __bitwise fpi_t; /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) struct pagesets { local_lock_t lock; @@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages); unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; +int percpu_pagelist_high_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); EXPORT_SYMBOL(init_on_alloc); @@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) #ifdef CONFIG_MMU int high; int nr_local_cpus; + unsigned long total_pages; + + if (!percpu_pagelist_high_fraction) { + /* + * By default, the high value of the pcp is based on the zone + * low watermark so that if they are full then background + * reclaim will not be started prematurely. 
+ */ + total_pages = low_wmark_pages(zone); + } else { + /* + * If percpu_pagelist_high_fraction is configured, the high + * value is based on a fraction of the managed pages in the + * zone. + */ + total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + } /* - * The high value of the pcp is based on the zone low watermark - * so that if they are full then background reclaim will not be - * started prematurely. The value is split across all online CPUs - * local to the zone. Note that early in boot that CPUs may not be - * online yet and that during CPU hotplug that the cpumask is not - * yet updated when a CPU is being onlined. + * Split the high value across all online CPUs local to the zone. Note + * that early in boot that CPUs may not be online yet and that during + * CPU hotplug that the cpumask is not yet updated when a CPU is being + * onlined. */ nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = low_wmark_pages(zone) / nr_local_cpus; + high = total_pages / nr_local_cpus; /* * Ensure high is at least batch*4. The multiple is based on the @@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. + */ +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int old_percpu_pagelist_high_fraction; + int ret; + + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_high_fraction && + percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { + percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) + goto out; + + for_each_populated_zone(zone) + zone_set_pageset_high_and_batch(zone, 0); +out: + mutex_unlock(&pcp_batch_high_lock); + return ret; +} + #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES /* * Returns the number of pages that arch has reserved but -- cgit From 777c00f5ede4fcb9ae49a2a957bec26d4d8f4c29 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Mon, 28 Jun 2021 19:42:27 -0700 Subject: mm: drop SECTION_SHIFT in code comments Actually SECTIONS_SHIFT is used in the kernel code, so the code comment is strictly incorrect. And since commit bbeae5b05ef6 ("mm: move page flags layout to separate header"), the SECTIONS_SHIFT definition has been moved to include/linux/page-flags-layout.h; since the code itself is quite straightforward, instead of moving the code comment into the new place as well, we simply remove it.
This also fixed a checkpatch complain derived from the original code: WARNING: please, no space before tabs + * SECTIONS_SHIFT ^I^I#bits space required to store a section #$ Link: https://lkml.kernel.org/r/20210531091908.1738465-2-aisheng.dong@nxp.com Signed-off-by: Dong Aisheng Suggested-by: Yu Zhao Reviewed-by: Yu Zhao Cc: Andrey Konovalov Cc: Catalin Marinas Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7d206ca850c7..3e62e8ef68b5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,8 +1200,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #ifdef CONFIG_SPARSEMEM /* - * SECTION_SHIFT #bits space required to store a section # - * * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ -- cgit From e47aa90568de326625b19d7bc872f8d70b0820b0 Mon Sep 17 00:00:00 2001 From: Dong Aisheng Date: Mon, 28 Jun 2021 19:42:30 -0700 Subject: mm/page_alloc: improve memmap_pages dbg msg Make debug message more accurate. Link: https://lkml.kernel.org/r/20210531091908.1738465-6-aisheng.dong@nxp.com Signed-off-by: Dong Aisheng Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfc4071310fb..2a306c34fda7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7383,7 +7383,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) pr_debug(" %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } -- cgit From f7ec104458e00d27a190348ac3a513f3df3699a4 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 28 Jun 2021 19:42:33 -0700 Subject: mm/page_alloc: fix counting of managed_pages commit f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") clears out zone->lowmem_reserve[] if zone is empty. But when zone is not empty and sysctl_lowmem_reserve_ratio[i] is set to zero, zone_managed_pages(zone) is not counted in the managed_pages either. This is inconsistent with the description of lowmem_reserve, so fix it. 
Link: https://lkml.kernel.org/r/20210527125707.3760259-1-liushixin2@huawei.com Fixes: f63661566fad ("mm/page_alloc.c: clear out zone->lowmem_reserve[] if the zone is empty") Signed-off-by: Liu Shixin Reported-by: yangerkun Reviewed-by: Baoquan He Acked-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a306c34fda7..fc151f6a7dbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8240,14 +8240,14 @@ static void setup_per_zone_lowmem_reserve(void) unsigned long managed_pages = 0; for (j = i + 1; j < MAX_NR_ZONES; j++) { - if (clear) { - zone->lowmem_reserve[j] = 0; - } else { - struct zone *upper_zone = &pgdat->node_zones[j]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); - managed_pages += zone_managed_pages(upper_zone); + if (clear) + zone->lowmem_reserve[j] = 0; + else zone->lowmem_reserve[j] = managed_pages / ratio; - } } } } -- cgit From 21d02f8f8464e27434f477c73431075197a9f72f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:42:36 -0700 Subject: mm/page_alloc: move free_the_page Patch series "Allow high order pages to be stored on PCP", v2. The per-cpu page allocator (PCP) only handles order-0 pages. With the series "Use local_lock for pcp protection and reduce stat overhead" and "Calculate pcp->high based on zone sizes and active CPUs", it's now feasible to store high-order pages on PCP lists. This small series allows PCP to store "cheap" orders where cheap is determined by PAGE_ALLOC_COSTLY_ORDER and THP-sized allocations. This patch (of 2): In the next patch, free_compound_page is going to use the common helper free_the_page. This patch moves the definition to ease review. No functional change. Link: https://lkml.kernel.org/r/20210603142220.10851-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210603142220.10851-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jesper Dangaard Brouer Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc151f6a7dbd..58f7a321598f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -687,6 +687,14 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (order == 0) /* Via pcp? */ + free_unref_page(page); + else + __free_pages_ok(page, order, FPI_NONE); +} + /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -5349,14 +5357,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -static inline void free_the_page(struct page *page, unsigned int order) -{ - if (order == 0) /* Via pcp? */ - free_unref_page(page); - else - __free_pages_ok(page, order, FPI_NONE); -} - /** * __free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). -- cgit From fdb7d9b7acd02f573ae4fc0c7772f6b5c6b1bad0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:39 -0700 Subject: alpha: remove DISCONTIGMEM and NUMA Patch series "Remove DISCONTIGMEM memory model", v3. The SPARSEMEM memory model was supposed to entirely replace DISCONTIGMEM a (long) while ago.
The last architectures that used DISCONTIGMEM were updated to use other memory models in v5.11 and it is about the time to entirely remove DISCONTIGMEM from the kernel. This set removes DISCONTIGMEM from alpha, arc and m68k, simplifies memory model selection in mm/Kconfig and replaces usage of redundant CONFIG_NEED_MULTIPLE_NODES and CONFIG_FLAT_NODE_MEM_MAP with CONFIG_NUMA and CONFIG_FLATMEM respectively. I've also removed NUMA support on alpha that was BROKEN for more than 15 years. There were also minor updates all over arch/ to remove mentions of DISCONTIGMEM in comments and #ifdefs. This patch (of 9): NUMA is marked broken on alpha for more than 15 years and DISCONTIGMEM was replaced with SPARSEMEM in v5.11. Remove both NUMA and DISCONTIGMEM support from alpha. Link: https://lkml.kernel.org/r/20210608091316.3622-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210608091316.3622-2-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 22 ---- arch/alpha/include/asm/machvec.h | 6 - arch/alpha/include/asm/mmzone.h | 100 ----------------- arch/alpha/include/asm/pgtable.h | 4 - arch/alpha/include/asm/topology.h | 39 ------- arch/alpha/kernel/core_marvel.c | 53 +-------- arch/alpha/kernel/core_wildfire.c | 29 +---- arch/alpha/kernel/pci_iommu.c | 29 ----- arch/alpha/kernel/proto.h | 8 -- arch/alpha/kernel/setup.c | 16 --- arch/alpha/kernel/sys_marvel.c | 5 - arch/alpha/kernel/sys_wildfire.c | 5 - arch/alpha/mm/Makefile | 2 - arch/alpha/mm/init.c | 3 - arch/alpha/mm/numa.c | 223 -------------------------------------- 15 files changed, 4 insertions(+), 540 deletions(-) delete mode 100644 arch/alpha/include/asm/mmzone.h delete mode 100644 arch/alpha/mm/numa.c diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5998106faa60..8954216b9956 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -549,29 +549,12 @@ config NR_CPUS MARVEL support can handle a maximum of 32 CPUs, all the others with working support have a maximum of 4 CPUs. -config ARCH_DISCONTIGMEM_ENABLE - bool "Discontiguous Memory Support" - depends on BROKEN - help - Say Y to support efficient handling of discontiguous physical memory, - for architectures which are either NUMA (Non-Uniform Memory Access) - or have huge holes in the physical address space for other reasons. - See for more. - config ARCH_SPARSEMEM_ENABLE bool "Sparse Memory Support" help Say Y to support efficient handling of discontiguous physical memory, for systems that have huge holes in the physical address space. -config NUMA - bool "NUMA Support (EXPERIMENTAL)" - depends on DISCONTIGMEM && BROKEN - help - Say Y to compile the kernel to support NUMA (Non-Uniform Memory - Access). This option is for configuring high-end multiprocessor - server machines. If in doubt, say N. - config ALPHA_WTINT bool "Use WTINT" if ALPHA_SRM || ALPHA_GENERIC default y if ALPHA_QEMU @@ -596,11 +579,6 @@ config ALPHA_WTINT If unsure, say N. 
-config NODES_SHIFT - int - default "7" - depends on NEED_MULTIPLE_NODES - # LARGE_VMALLOC is racy, if you *really* need it then fix it first config ALPHA_LARGE_VMALLOC bool diff --git a/arch/alpha/include/asm/machvec.h b/arch/alpha/include/asm/machvec.h index a4e96e2bec74..e49fabce7b33 100644 --- a/arch/alpha/include/asm/machvec.h +++ b/arch/alpha/include/asm/machvec.h @@ -99,12 +99,6 @@ struct alpha_machine_vector const char *vector_name; - /* NUMA information */ - int (*pa_to_nid)(unsigned long); - int (*cpuid_to_nid)(int); - unsigned long (*node_mem_start)(int); - unsigned long (*node_mem_size)(int); - /* System specific parameters. */ union { struct { diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h deleted file mode 100644 index 86644604d977..000000000000 --- a/arch/alpha/include/asm/mmzone.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Written by Kanoj Sarcar (kanoj@sgi.com) Aug 99 - * Adapted for the alpha wildfire architecture Jan 2001. - */ -#ifndef _ASM_MMZONE_H_ -#define _ASM_MMZONE_H_ - -#ifdef CONFIG_DISCONTIGMEM - -#include - -/* - * Following are macros that are specific to this numa platform. - */ - -extern pg_data_t node_data[]; - -#define alpha_pa_to_nid(pa) \ - (alpha_mv.pa_to_nid \ - ? alpha_mv.pa_to_nid(pa) \ - : (0)) -#define node_mem_start(nid) \ - (alpha_mv.node_mem_start \ - ? alpha_mv.node_mem_start(nid) \ - : (0UL)) -#define node_mem_size(nid) \ - (alpha_mv.node_mem_size \ - ? alpha_mv.node_mem_size(nid) \ - : ((nid) ? (0UL) : (~0UL))) - -#define pa_to_nid(pa) alpha_pa_to_nid(pa) -#define NODE_DATA(nid) (&node_data[(nid)]) - -#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) - -#if 1 -#define PLAT_NODE_DATA_LOCALNR(p, n) \ - (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) -#else -static inline unsigned long -PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) -{ - unsigned long temp; - temp = p >> PAGE_SHIFT; - return temp - PLAT_NODE_DATA(n)->gendata.node_start_pfn; -} -#endif - -/* - * Following are macros that each numa implementation must define. - */ - -/* - * Given a kernel address, find the home node of the underlying memory. - */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) - -/* - * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory - * and returns the kaddr corresponding to first physical page in the - * node's mem_map. 
- */ -#define LOCAL_BASE_ADDR(kaddr) \ - ((unsigned long)__va(NODE_DATA(kvaddr_to_nid(kaddr))->node_start_pfn \ - << PAGE_SHIFT)) - -/* XXX: FIXME -- nyc */ -#define kern_addr_valid(kaddr) (0) - -#define mk_pte(page, pgprot) \ -({ \ - pte_t pte; \ - unsigned long pfn; \ - \ - pfn = page_to_pfn(page) << 32; \ - pte_val(pte) = pfn | pgprot_val(pgprot); \ - \ - pte; \ -}) - -#define pte_page(x) \ -({ \ - unsigned long kvirt; \ - struct page * __xx; \ - \ - kvirt = (unsigned long)__va(pte_val(x) >> (32-PAGE_SHIFT)); \ - __xx = virt_to_page(kvirt); \ - \ - __xx; \ -}) - -#define pfn_to_nid(pfn) pa_to_nid(((u64)(pfn) << PAGE_SHIFT)) -#define pfn_valid(pfn) \ - (((pfn) - node_start_pfn(pfn_to_nid(pfn))) < \ - node_spanned_pages(pfn_to_nid(pfn))) \ - -#endif /* CONFIG_DISCONTIGMEM */ - -#endif /* _ASM_MMZONE_H_ */ diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 8d856c62e22a..e1757b7cfe3d 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -206,7 +206,6 @@ extern unsigned long __zero_page(void); #define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) #define pte_pfn(pte) (pte_val(pte) >> 32) -#ifndef CONFIG_DISCONTIGMEM #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define mk_pte(page, pgprot) \ ({ \ @@ -215,7 +214,6 @@ extern unsigned long __zero_page(void); pte_val(pte) = (page_to_pfn(page) << 32) | pgprot_val(pgprot); \ pte; \ }) -#endif extern inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot) { pte_t pte; pte_val(pte) = (PHYS_TWIDDLE(physpfn) << 32) | pgprot_val(pgprot); return pte; } @@ -330,9 +328,7 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#ifndef CONFIG_DISCONTIGMEM #define kern_addr_valid(addr) (1) -#endif #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h index 5a77a40567fa..7d393036aa8f 100644 --- a/arch/alpha/include/asm/topology.h +++ b/arch/alpha/include/asm/topology.h @@ -7,45 +7,6 @@ #include #include -#ifdef CONFIG_NUMA -static inline int cpu_to_node(int cpu) -{ - int node; - - if (!alpha_mv.cpuid_to_nid) - return 0; - - node = alpha_mv.cpuid_to_nid(cpu); - -#ifdef DEBUG_NUMA - BUG_ON(node < 0); -#endif - - return node; -} - -extern struct cpumask node_to_cpumask_map[]; -/* FIXME: This is dumb, recalculating every time. But simple. */ -static const struct cpumask *cpumask_of_node(int node) -{ - int cpu; - - if (node == NUMA_NO_NODE) - return cpu_all_mask; - - cpumask_clear(&node_to_cpumask_map[node]); - - for_each_online_cpu(cpu) { - if (cpu_to_node(cpu) == node) - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - } - - return &node_to_cpumask_map[node]; -} - -#define cpumask_of_pcibus(bus) (cpu_online_mask) - -#endif /* !CONFIG_NUMA */ # include #endif /* _ASM_ALPHA_TOPOLOGY_H */ diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 4485b77f8658..1efca79ac83c 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -287,8 +287,7 @@ io7_init_hose(struct io7 *io7, int port) /* * Set up window 0 for scatter-gather 8MB at 8MB. 
*/ - hose->sg_isa = iommu_arena_new_node(marvel_cpuid_to_nid(io7->pe), - hose, 0x00800000, 0x00800000, 0); + hose->sg_isa = iommu_arena_new_node(0, hose, 0x00800000, 0x00800000, 0); hose->sg_isa->align_entry = 8; /* cache line boundary */ csrs->POx_WBASE[0].csr = hose->sg_isa->dma_base | wbase_m_ena | wbase_m_sg; @@ -305,8 +304,7 @@ io7_init_hose(struct io7 *io7, int port) /* * Set up window 2 for scatter-gather (up-to) 1GB at 3GB. */ - hose->sg_pci = iommu_arena_new_node(marvel_cpuid_to_nid(io7->pe), - hose, 0xc0000000, 0x40000000, 0); + hose->sg_pci = iommu_arena_new_node(0, hose, 0xc0000000, 0x40000000, 0); hose->sg_pci->align_entry = 8; /* cache line boundary */ csrs->POx_WBASE[2].csr = hose->sg_pci->dma_base | wbase_m_ena | wbase_m_sg; @@ -843,53 +841,8 @@ EXPORT_SYMBOL(marvel_ioportmap); EXPORT_SYMBOL(marvel_ioread8); EXPORT_SYMBOL(marvel_iowrite8); #endif - -/* - * NUMA Support - */ -/********** - * FIXME - for now each cpu is a node by itself - * -- no real support for striped mode - ********** - */ -int -marvel_pa_to_nid(unsigned long pa) -{ - int cpuid; - if ((pa >> 43) & 1) /* I/O */ - cpuid = (~(pa >> 35) & 0xff); - else /* mem */ - cpuid = ((pa >> 34) & 0x3) | ((pa >> (37 - 2)) & (0x1f << 2)); - - return marvel_cpuid_to_nid(cpuid); -} - -int -marvel_cpuid_to_nid(int cpuid) -{ - return cpuid; -} - -unsigned long -marvel_node_mem_start(int nid) -{ - unsigned long pa; - - pa = (nid & 0x3) | ((nid & (0x1f << 2)) << 1); - pa <<= 34; - - return pa; -} - -unsigned long -marvel_node_mem_size(int nid) -{ - return 16UL * 1024 * 1024 * 1024; /* 16GB */ -} - - -/* +/* * AGP GART Support. */ #include diff --git a/arch/alpha/kernel/core_wildfire.c b/arch/alpha/kernel/core_wildfire.c index e8d3b033018d..3a804b67f9da 100644 --- a/arch/alpha/kernel/core_wildfire.c +++ b/arch/alpha/kernel/core_wildfire.c @@ -434,39 +434,12 @@ wildfire_write_config(struct pci_bus *bus, unsigned int devfn, int where, return PCIBIOS_SUCCESSFUL; } -struct pci_ops wildfire_pci_ops = +struct pci_ops wildfire_pci_ops = { .read = wildfire_read_config, .write = wildfire_write_config, }; - -/* - * NUMA Support - */ -int wildfire_pa_to_nid(unsigned long pa) -{ - return pa >> 36; -} - -int wildfire_cpuid_to_nid(int cpuid) -{ - /* assume 4 CPUs per node */ - return cpuid >> 2; -} - -unsigned long wildfire_node_mem_start(int nid) -{ - /* 64GB per node */ - return (unsigned long)nid * (64UL * 1024 * 1024 * 1024); -} - -unsigned long wildfire_node_mem_size(int nid) -{ - /* 64GB per node */ - return 64UL * 1024 * 1024 * 1024; -} - #if DEBUG_DUMP_REGS static void __init diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index d84b19aa8e9d..35d7b3096d6e 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -71,33 +71,6 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, if (align < mem_size) align = mem_size; - -#ifdef CONFIG_DISCONTIGMEM - - arena = memblock_alloc_node(sizeof(*arena), align, nid); - if (!NODE_DATA(nid) || !arena) { - printk("%s: couldn't allocate arena from node %d\n" - " falling back to system-wide allocation\n", - __func__, nid); - arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); - if (!arena) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*arena)); - } - - arena->ptes = memblock_alloc_node(sizeof(*arena), align, nid); - if (!NODE_DATA(nid) || !arena->ptes) { - printk("%s: couldn't allocate arena ptes from node %d\n" - " falling back to system-wide allocation\n", - __func__, nid); - arena->ptes = 
memblock_alloc(mem_size, align); - if (!arena->ptes) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, mem_size, align); - } - -#else /* CONFIG_DISCONTIGMEM */ - arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); if (!arena) panic("%s: Failed to allocate %zu bytes\n", __func__, @@ -107,8 +80,6 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, mem_size, align); -#endif /* CONFIG_DISCONTIGMEM */ - spin_lock_init(&arena->lock); arena->hose = hose; arena->dma_base = base; diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h index 701a05090141..5816a31c1b38 100644 --- a/arch/alpha/kernel/proto.h +++ b/arch/alpha/kernel/proto.h @@ -49,10 +49,6 @@ extern void marvel_init_arch(void); extern void marvel_kill_arch(int); extern void marvel_machine_check(unsigned long, unsigned long); extern void marvel_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); -extern int marvel_pa_to_nid(unsigned long); -extern int marvel_cpuid_to_nid(int); -extern unsigned long marvel_node_mem_start(int); -extern unsigned long marvel_node_mem_size(int); extern struct _alpha_agp_info *marvel_agp_info(void); struct io7 *marvel_find_io7(int pe); struct io7 *marvel_next_io7(struct io7 *prev); @@ -101,10 +97,6 @@ extern void wildfire_init_arch(void); extern void wildfire_kill_arch(int); extern void wildfire_machine_check(unsigned long vector, unsigned long la_ptr); extern void wildfire_pci_tbi(struct pci_controller *, dma_addr_t, dma_addr_t); -extern int wildfire_pa_to_nid(unsigned long); -extern int wildfire_cpuid_to_nid(int); -extern unsigned long wildfire_node_mem_start(int); -extern unsigned long wildfire_node_mem_size(int); /* console.c */ #ifdef CONFIG_VGA_HOSE diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 03dda3beb3bd..5f6858e9dc28 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -79,11 +79,6 @@ int alpha_l3_cacheshape; unsigned long alpha_verbose_mcheck = CONFIG_VERBOSE_MCHECK_ON; #endif -#ifdef CONFIG_NUMA -struct cpumask node_to_cpumask_map[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_to_cpumask_map); -#endif - /* Which processor we booted from. 
*/ int boot_cpuid; @@ -305,7 +300,6 @@ move_initrd(unsigned long mem_limit) } #endif -#ifndef CONFIG_DISCONTIGMEM static void __init setup_memory(void *kernel_end) { @@ -389,9 +383,6 @@ setup_memory(void *kernel_end) } #endif /* CONFIG_BLK_DEV_INITRD */ } -#else -extern void setup_memory(void *); -#endif /* !CONFIG_DISCONTIGMEM */ int __init page_is_ram(unsigned long pfn) @@ -618,13 +609,6 @@ setup_arch(char **cmdline_p) "VERBOSE_MCHECK " #endif -#ifdef CONFIG_DISCONTIGMEM - "DISCONTIGMEM " -#ifdef CONFIG_NUMA - "NUMA " -#endif -#endif - #ifdef CONFIG_DEBUG_SPINLOCK "DEBUG_SPINLOCK " #endif diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c index 83d6c53d6d4d..1f99b03effc2 100644 --- a/arch/alpha/kernel/sys_marvel.c +++ b/arch/alpha/kernel/sys_marvel.c @@ -461,10 +461,5 @@ struct alpha_machine_vector marvel_ev7_mv __initmv = { .kill_arch = marvel_kill_arch, .pci_map_irq = marvel_map_irq, .pci_swizzle = common_swizzle, - - .pa_to_nid = marvel_pa_to_nid, - .cpuid_to_nid = marvel_cpuid_to_nid, - .node_mem_start = marvel_node_mem_start, - .node_mem_size = marvel_node_mem_size, }; ALIAS_MV(marvel_ev7) diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c index 2c54d707142a..3cee05443f07 100644 --- a/arch/alpha/kernel/sys_wildfire.c +++ b/arch/alpha/kernel/sys_wildfire.c @@ -337,10 +337,5 @@ struct alpha_machine_vector wildfire_mv __initmv = { .kill_arch = wildfire_kill_arch, .pci_map_irq = wildfire_map_irq, .pci_swizzle = common_swizzle, - - .pa_to_nid = wildfire_pa_to_nid, - .cpuid_to_nid = wildfire_cpuid_to_nid, - .node_mem_start = wildfire_node_mem_start, - .node_mem_size = wildfire_node_mem_size, }; ALIAS_MV(wildfire) diff --git a/arch/alpha/mm/Makefile b/arch/alpha/mm/Makefile index 08ac6612edad..bd770302eb82 100644 --- a/arch/alpha/mm/Makefile +++ b/arch/alpha/mm/Makefile @@ -6,5 +6,3 @@ ccflags-y := -Werror obj-y := init.o fault.o - -obj-$(CONFIG_DISCONTIGMEM) += numa.o diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a97650a618f1..f6114d03357c 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -235,8 +235,6 @@ callback_init(void * kernel_end) return kernel_end; } - -#ifndef CONFIG_DISCONTIGMEM /* * paging_init() sets up the memory map. */ @@ -257,7 +255,6 @@ void __init paging_init(void) /* Initialize the kernel's ZERO_PGE. */ memset((void *)ZERO_PGE, 0, PAGE_SIZE); } -#endif /* CONFIG_DISCONTIGMEM */ #if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SRM) void diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c deleted file mode 100644 index 0636e254a22f..000000000000 --- a/arch/alpha/mm/numa.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/alpha/mm/numa.c - * - * DISCONTIGMEM NUMA alpha support. - * - * Copyright (C) 2001 Andrea Arcangeli SuSE - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -pg_data_t node_data[MAX_NUMNODES]; -EXPORT_SYMBOL(node_data); - -#undef DEBUG_DISCONTIG -#ifdef DEBUG_DISCONTIG -#define DBGDCONT(args...) printk(args) -#else -#define DBGDCONT(args...) -#endif - -#define for_each_mem_cluster(memdesc, _cluster, i) \ - for ((_cluster) = (memdesc)->cluster, (i) = 0; \ - (i) < (memdesc)->numclusters; (i)++, (_cluster)++) - -static void __init show_mem_layout(void) -{ - struct memclust_struct * cluster; - struct memdesc_struct * memdesc; - int i; - - /* Find free clusters, and init and free the bootmem accordingly. 
*/ - memdesc = (struct memdesc_struct *) - (hwrpb->mddt_offset + (unsigned long) hwrpb); - - printk("Raw memory layout:\n"); - for_each_mem_cluster(memdesc, cluster, i) { - printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", - i, cluster->usage, cluster->start_pfn, - cluster->start_pfn + cluster->numpages); - } -} - -static void __init -setup_memory_node(int nid, void *kernel_end) -{ - extern unsigned long mem_size_limit; - struct memclust_struct * cluster; - struct memdesc_struct * memdesc; - unsigned long start_kernel_pfn, end_kernel_pfn; - unsigned long start, end; - unsigned long node_pfn_start, node_pfn_end; - unsigned long node_min_pfn, node_max_pfn; - int i; - int show_init = 0; - - /* Find the bounds of current node */ - node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT; - node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT); - - /* Find free clusters, and init and free the bootmem accordingly. */ - memdesc = (struct memdesc_struct *) - (hwrpb->mddt_offset + (unsigned long) hwrpb); - - /* find the bounds of this node (node_min_pfn/node_max_pfn) */ - node_min_pfn = ~0UL; - node_max_pfn = 0UL; - for_each_mem_cluster(memdesc, cluster, i) { - /* Bit 0 is console/PALcode reserved. Bit 1 is - non-volatile memory -- we might want to mark - this for later. */ - if (cluster->usage & 3) - continue; - - start = cluster->start_pfn; - end = start + cluster->numpages; - - if (start >= node_pfn_end || end <= node_pfn_start) - continue; - - if (!show_init) { - show_init = 1; - printk("Initializing bootmem allocator on Node ID %d\n", nid); - } - printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", - i, cluster->usage, cluster->start_pfn, - cluster->start_pfn + cluster->numpages); - - if (start < node_pfn_start) - start = node_pfn_start; - if (end > node_pfn_end) - end = node_pfn_end; - - if (start < node_min_pfn) - node_min_pfn = start; - if (end > node_max_pfn) - node_max_pfn = end; - } - - if (mem_size_limit && node_max_pfn > mem_size_limit) { - static int msg_shown = 0; - if (!msg_shown) { - msg_shown = 1; - printk("setup: forcing memory size to %ldK (from %ldK).\n", - mem_size_limit << (PAGE_SHIFT - 10), - node_max_pfn << (PAGE_SHIFT - 10)); - } - node_max_pfn = mem_size_limit; - } - - if (node_min_pfn >= node_max_pfn) - return; - - /* Update global {min,max}_low_pfn from node information. */ - if (node_min_pfn < min_low_pfn) - min_low_pfn = node_min_pfn; - if (node_max_pfn > max_low_pfn) - max_pfn = max_low_pfn = node_max_pfn; - -#if 0 /* we'll try this one again in a little while */ - /* Cute trick to make sure our local node data is on local memory */ - node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); -#endif - printk(" Detected node memory: start %8lu, end %8lu\n", - node_min_pfn, node_max_pfn); - - DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); - - /* Find the bounds of kernel memory. */ - start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); - end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); - - if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) - panic("kernel loaded out of ram"); - - memblock_add_node(PFN_PHYS(node_min_pfn), - (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid); - - /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. - Note that we round this down, not up - node memory - has much larger alignment than 8Mb, so it's safe. 
*/ - node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); - - NODE_DATA(nid)->node_start_pfn = node_min_pfn; - NODE_DATA(nid)->node_present_pages = node_max_pfn - node_min_pfn; - - node_set_online(nid); -} - -void __init -setup_memory(void *kernel_end) -{ - unsigned long kernel_size; - int nid; - - show_mem_layout(); - - nodes_clear(node_online_map); - - min_low_pfn = ~0UL; - max_low_pfn = 0UL; - for (nid = 0; nid < MAX_NUMNODES; nid++) - setup_memory_node(nid, kernel_end); - - kernel_size = virt_to_phys(kernel_end) - KERNEL_START_PHYS; - memblock_reserve(KERNEL_START_PHYS, kernel_size); - -#ifdef CONFIG_BLK_DEV_INITRD - initrd_start = INITRD_START; - if (initrd_start) { - extern void *move_initrd(unsigned long); - - initrd_end = initrd_start+INITRD_SIZE; - printk("Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *) initrd_start, INITRD_SIZE); - - if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) { - if (!move_initrd(PFN_PHYS(max_low_pfn))) - printk("initrd extends beyond end of memory " - "(0x%08lx > 0x%p)\ndisabling initrd\n", - initrd_end, - phys_to_virt(PFN_PHYS(max_low_pfn))); - } else { - nid = kvaddr_to_nid(initrd_start); - memblock_reserve(virt_to_phys((void *)initrd_start), - INITRD_SIZE); - } - } -#endif /* CONFIG_BLK_DEV_INITRD */ -} - -void __init paging_init(void) -{ - unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, }; - unsigned long dma_local_pfn; - - /* - * The old global MAX_DMA_ADDRESS per-arch API doesn't fit - * in the NUMA model, for now we convert it to a pfn and - * we interpret this pfn as a local per-node information. - * This issue isn't very important since none of these machines - * have legacy ISA slots anyways. - */ - dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - max_zone_pfn[ZONE_DMA] = dma_local_pfn; - max_zone_pfn[ZONE_NORMAL] = max_pfn; - - free_area_init(max_zone_pfn); - - /* Initialize the kernel's ZERO_PGE. */ - memset((void *)ZERO_PGE, 0, PAGE_SIZE); -} -- cgit From e7793e53901b31a06db534679e77c0cdeab260a2 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:43 -0700 Subject: arc: update comment about HIGHMEM implementation Arc does not use DISCONTIGMEM to implement high memory, update the comment describing how high memory works to reflect this. Link: https://lkml.kernel.org/r/20210608091316.3622-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Vineet Gupta Acked-by: Arnd Bergmann Reviewed-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/mm/init.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index e2ed355438c9..397a201adfe3 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -139,16 +139,13 @@ void __init setup_arch_memory(void) #ifdef CONFIG_HIGHMEM /* - * Populate a new node with highmem - * * On ARC (w/o PAE) HIGHMEM addresses are actually smaller (0 based) - * than addresses in normal ala low memory (0x8000_0000 based). + * than addresses in normal aka low memory (0x8000_0000 based). * Even with PAE, the huge peripheral space hole would waste a lot of - * mem with single mem_map[]. This warrants a mem_map per region design. - * Thus HIGHMEM on ARC is imlemented with DISCONTIGMEM. - * - * DISCONTIGMEM in turns requires multiple nodes. node 0 above is - * populated with normal memory zone while node 1 only has highmem + * mem with single contiguous mem_map[]. 
+ * Thus when HIGHMEM on ARC is enabled the memory map corresponding + * to the hole is freed and ARC specific version of pfn_valid() + * handles the hole in the memory map. */ #ifdef CONFIG_DISCONTIGMEM node_set_online(1); -- cgit From 8b793b442051550b6cc694213e276587e01bddcb Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:46 -0700 Subject: arc: remove support for DISCONTIGMEM DISCONTIGMEM was replaced by FLATMEM with freeing of the unused memory map in v5.11. Remove the support for DISCONTIGMEM entirely. Link: https://lkml.kernel.org/r/20210608091316.3622-4-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Vineet Gupta Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 13 ------------- arch/arc/include/asm/mmzone.h | 40 ---------------------------------------- arch/arc/mm/init.c | 8 -------- 3 files changed, 61 deletions(-) delete mode 100644 arch/arc/include/asm/mmzone.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 2d98501c0897..d8f51eb8963b 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -62,10 +62,6 @@ config SCHED_OMIT_FRAME_POINTER config GENERIC_CSUM def_bool y -config ARCH_DISCONTIGMEM_ENABLE - def_bool n - depends on BROKEN - config ARCH_FLATMEM_ENABLE def_bool y @@ -344,15 +340,6 @@ config ARC_HUGEPAGE_16M endchoice -config NODES_SHIFT - int "Maximum NUMA Nodes (as a power of 2)" - default "0" if !DISCONTIGMEM - default "1" if DISCONTIGMEM - depends on NEED_MULTIPLE_NODES - help - Accessing memory beyond 1GB (with or w/o PAE) requires 2 memory - zones. - config ARC_COMPACT_IRQ_LEVELS depends on ISA_ARCOMPACT bool "Setup Timer IRQ as high Priority" diff --git a/arch/arc/include/asm/mmzone.h b/arch/arc/include/asm/mmzone.h deleted file mode 100644 index b86b9d1e54dc..000000000000 --- a/arch/arc/include/asm/mmzone.h +++ /dev/null @@ -1,40 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 Synopsys, Inc. (www.synopsys.com) - */ - -#ifndef _ASM_ARC_MMZONE_H -#define _ASM_ARC_MMZONE_H - -#ifdef CONFIG_DISCONTIGMEM - -extern struct pglist_data node_data[]; -#define NODE_DATA(nid) (&node_data[nid]) - -static inline int pfn_to_nid(unsigned long pfn) -{ - int is_end_low = 1; - - if (IS_ENABLED(CONFIG_ARC_HAS_PAE40)) - is_end_low = pfn <= virt_to_pfn(0xFFFFFFFFUL); - - /* - * node 0: lowmem: 0x8000_0000 to 0xFFFF_FFFF - * node 1: HIGHMEM w/o PAE40: 0x0 to 0x7FFF_FFFF - * HIGHMEM with PAE40: 0x1_0000_0000 to ... - */ - if (pfn >= ARCH_PFN_OFFSET && is_end_low) - return 0; - - return 1; -} - -static inline int pfn_valid(unsigned long pfn) -{ - int nid = pfn_to_nid(pfn); - - return (pfn <= node_end_pfn(nid)); -} -#endif /* CONFIG_DISCONTIGMEM */ - -#endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 397a201adfe3..abfeef7bf6f8 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -32,11 +32,6 @@ unsigned long arch_pfn_offset; EXPORT_SYMBOL(arch_pfn_offset); #endif -#ifdef CONFIG_DISCONTIGMEM -struct pglist_data node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); -#endif - long __init arc_get_mem_sz(void) { return low_mem_sz; @@ -147,9 +142,6 @@ void __init setup_arch_memory(void) * to the hole is freed and ARC specific version of pfn_valid() * handles the hole in the memory map. 
*/ -#ifdef CONFIG_DISCONTIGMEM - node_set_online(1); -#endif min_high_pfn = PFN_DOWN(high_mem_start); max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz); -- cgit From 5ab06e10990c3a04e00318c5ca93048c0f53a0a7 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:49 -0700 Subject: m68k: remove support for DISCONTIGMEM DISCONTIGMEM was replaced by FLATMEM with freeing of the unused memory map in v5.11. Remove the support for DISCONTIGMEM entirely. Link: https://lkml.kernel.org/r/20210608091316.3622-5-rppt@kernel.org Signed-off-by: Mike Rapoport Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Arnd Bergmann Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/Kconfig.cpu | 10 ---------- arch/m68k/include/asm/mmzone.h | 10 ---------- arch/m68k/include/asm/page.h | 2 +- arch/m68k/include/asm/page_mm.h | 35 ----------------------------------- arch/m68k/mm/init.c | 20 -------------------- 5 files changed, 1 insertion(+), 76 deletions(-) delete mode 100644 arch/m68k/include/asm/mmzone.h diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index f4d23977d2a5..29e946394fdb 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -408,10 +408,6 @@ config SINGLE_MEMORY_CHUNK order" to save memory that could be wasted for unused memory map. Say N if not sure. -config ARCH_DISCONTIGMEM_ENABLE - depends on BROKEN - def_bool MMU && !SINGLE_MEMORY_CHUNK - config FORCE_MAX_ZONEORDER int "Maximum zone order" if ADVANCED depends on !SINGLE_MEMORY_CHUNK @@ -451,11 +447,6 @@ config M68K_L2_CACHE depends on MAC default y -config NODES_SHIFT - int - default "3" - depends on DISCONTIGMEM - config CPU_HAS_NO_BITFIELDS bool @@ -553,4 +544,3 @@ config CACHE_COPYBACK The ColdFire CPU cache is set into Copy-back mode. 
endchoice endif - diff --git a/arch/m68k/include/asm/mmzone.h b/arch/m68k/include/asm/mmzone.h deleted file mode 100644 index 64573fe8e60d..000000000000 --- a/arch/m68k/include/asm/mmzone.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_M68K_MMZONE_H_ -#define _ASM_M68K_MMZONE_H_ - -extern pg_data_t pg_data_map[]; - -#define NODE_DATA(nid) (&pg_data_map[nid]) -#define NODE_MEM_MAP(nid) (NODE_DATA(nid)->node_mem_map) - -#endif /* _ASM_M68K_MMZONE_H_ */ diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h index 97087dd3ca6d..2f1c54e4725d 100644 --- a/arch/m68k/include/asm/page.h +++ b/arch/m68k/include/asm/page.h @@ -62,7 +62,7 @@ extern unsigned long _ramend; #include #endif -#if !defined(CONFIG_MMU) || defined(CONFIG_DISCONTIGMEM) +#ifndef CONFIG_MMU #define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) #define __pfn_to_phys(pfn) PFN_PHYS(pfn) #endif diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index 2411ea9ef578..a5b459bcb7d8 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -126,26 +126,6 @@ static inline void *__va(unsigned long x) extern int m68k_virt_to_node_shift; -#ifndef CONFIG_DISCONTIGMEM -#define __virt_to_node(addr) (&pg_data_map[0]) -#else -extern struct pglist_data *pg_data_table[]; - -static inline __attribute_const__ int __virt_to_node_shift(void) -{ - int shift; - - asm ( - "1: moveq #0,%0\n" - m68k_fixup(%c1, 1b) - : "=d" (shift) - : "i" (m68k_fixup_vnode_shift)); - return shift; -} - -#define __virt_to_node(addr) (pg_data_table[(unsigned long)(addr) >> __virt_to_node_shift()]) -#endif - #define virt_to_page(addr) ({ \ pfn_to_page(virt_to_pfn(addr)); \ }) @@ -153,23 +133,8 @@ static inline __attribute_const__ int __virt_to_node_shift(void) pfn_to_virt(page_to_pfn(page)); \ }) -#ifdef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) ({ \ - unsigned long __pfn = (pfn); \ - struct pglist_data *pgdat; \ - pgdat = __virt_to_node((unsigned long)pfn_to_virt(__pfn)); \ - pgdat->node_mem_map + (__pfn - pgdat->node_start_pfn); \ -}) -#define page_to_pfn(_page) ({ \ - const struct page *__p = (_page); \ - struct pglist_data *pgdat; \ - pgdat = &pg_data_map[page_to_nid(__p)]; \ - ((__p) - pgdat->node_mem_map) + pgdat->node_start_pfn; \ -}) -#else #define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) #include -#endif #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1759ab875d47..5d749e188246 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -44,28 +44,8 @@ EXPORT_SYMBOL(empty_zero_page); int m68k_virt_to_node_shift; -#ifdef CONFIG_DISCONTIGMEM -pg_data_t pg_data_map[MAX_NUMNODES]; -EXPORT_SYMBOL(pg_data_map); - -pg_data_t *pg_data_table[65]; -EXPORT_SYMBOL(pg_data_table); -#endif - void __init m68k_setup_node(int node) { -#ifdef CONFIG_DISCONTIGMEM - struct m68k_mem_info *info = m68k_memory + node; - int i, end; - - i = (unsigned long)phys_to_virt(info->addr) >> __virt_to_node_shift(); - end = (unsigned long)phys_to_virt(info->addr + info->size - 1) >> __virt_to_node_shift(); - for (; i <= end; i++) { - if (pg_data_table[i]) - pr_warn("overlap at %u for chunk %u\n", i, node); - pg_data_table[i] = pg_data_map + node; - } -#endif node_set_online(node); } -- cgit From bb1c50d3967f69f413b333713c2718d48d1ab7ea Mon Sep 17 00:00:00 2001 From: 
Mike Rapoport Date: Mon, 28 Jun 2021 19:42:52 -0700 Subject: mm: remove CONFIG_DISCONTIGMEM There are no architectures that support DISCONTIGMEM left. Remove the configuration option and the dead code it was guarding in the generic memory management code. Link: https://lkml.kernel.org/r/20210608091316.3622-6-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/memory_model.h | 37 ++++--------------------------------- include/linux/mmzone.h | 8 +++++--- mm/Kconfig | 25 +++---------------------- mm/page_alloc.c | 13 ------------- 4 files changed, 12 insertions(+), 71 deletions(-) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 7637fb46ba4f..a2c8ed60233a 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -6,47 +6,18 @@ #ifndef __ASSEMBLY__ +/* + * supports 3 memory models. + */ #if defined(CONFIG_FLATMEM) #ifndef ARCH_PFN_OFFSET #define ARCH_PFN_OFFSET (0UL) #endif -#elif defined(CONFIG_DISCONTIGMEM) - -#ifndef arch_pfn_to_nid -#define arch_pfn_to_nid(pfn) pfn_to_nid(pfn) -#endif - -#ifndef arch_local_page_offset -#define arch_local_page_offset(pfn, nid) \ - ((pfn) - NODE_DATA(nid)->node_start_pfn) -#endif - -#endif /* CONFIG_DISCONTIGMEM */ - -/* - * supports 3 memory models. - */ -#if defined(CONFIG_FLATMEM) - #define __pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET)) #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) -#elif defined(CONFIG_DISCONTIGMEM) - -#define __pfn_to_page(pfn) \ -({ unsigned long __pfn = (pfn); \ - unsigned long __nid = arch_pfn_to_nid(__pfn); \ - NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\ -}) - -#define __page_to_pfn(pg) \ -({ const struct page *__pg = (pg); \ - struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg)); \ - (unsigned long)(__pg - __pgdat->node_mem_map) + \ - __pgdat->node_start_pfn; \ -}) #elif defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -70,7 +41,7 @@ struct mem_section *__sec = __pfn_to_section(__pfn); \ __section_mem_map_addr(__sec) + __pfn; \ }) -#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ +#endif /* CONFIG_FLATMEM/SPARSEMEM */ /* * Convert a physical address to a Page Frame Number and back diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3e62e8ef68b5..6f9829562af2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -749,10 +749,12 @@ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +/* + * The array of struct pages for flatmem. + * It must be declared for SPARSEMEM as well because there are configurations + * that rely on that. + */ extern struct page *mem_map; -#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { diff --git a/mm/Kconfig b/mm/Kconfig index 02d44e3420f5..218b96ccc84a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -19,7 +19,7 @@ choice config FLATMEM_MANUAL bool "Flat Memory" - depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE + depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE help This option is best suited for non-NUMA systems with flat address space. 
The FLATMEM is the most efficient @@ -32,21 +32,6 @@ config FLATMEM_MANUAL If unsure, choose this option (Flat Memory) over any other. -config DISCONTIGMEM_MANUAL - bool "Discontiguous Memory" - depends on ARCH_DISCONTIGMEM_ENABLE - help - This option provides enhanced support for discontiguous - memory systems, over FLATMEM. These systems have holes - in their physical address spaces, and this option provides - more efficient handling of these holes. - - Although "Discontiguous Memory" is still used by several - architectures, it is considered deprecated in favor of - "Sparse Memory". - - If unsure, choose "Sparse Memory" over this option. - config SPARSEMEM_MANUAL bool "Sparse Memory" depends on ARCH_SPARSEMEM_ENABLE @@ -62,17 +47,13 @@ config SPARSEMEM_MANUAL endchoice -config DISCONTIGMEM - def_bool y - depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL - config SPARSEMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y - depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + depends on !SPARSEMEM || FLATMEM_MANUAL config FLAT_NODE_MEM_MAP def_bool y @@ -85,7 +66,7 @@ config FLAT_NODE_MEM_MAP # config NEED_MULTIPLE_NODES def_bool y - depends on DISCONTIGMEM || NUMA + depends on NUMA # # SPARSEMEM_EXTREME (which is the default) does some bootmem diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58f7a321598f..8926f3fd3bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -349,20 +349,7 @@ compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; -#ifdef CONFIG_DISCONTIGMEM -/* - * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges - * are not on separate NUMA nodes. Functionally this works but with - * watermark_boost_factor, it can reclaim prematurely as the ranges can be - * quite small. By default, do not boost watermarks on discontigmem as in - * many cases very high-order allocations like THP are likely to be - * unsupported and the premature reclaim offsets the advantage of long-term - * fragmentation avoidance. - */ -int watermark_boost_factor __read_mostly; -#else int watermark_boost_factor __read_mostly = 15000; -#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; -- cgit From d3c251ab95b69f3dc189c4657baeac1b4c050789 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:55 -0700 Subject: arch, mm: remove stale mentions of DISCONIGMEM There are several places that mention DISCONIGMEM in comments or have stale code guarded by CONFIG_DISCONTIGMEM. Remove the dead code and update the comments. Link: https://lkml.kernel.org/r/20210608091316.3622-7-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Reviewed-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/topology.c | 5 ++--- arch/ia64/mm/numa.c | 5 ++--- arch/mips/include/asm/mmzone.h | 6 ------ arch/mips/mm/init.c | 3 --- arch/nds32/include/asm/memory.h | 6 ------ arch/xtensa/include/asm/page.h | 4 ---- include/linux/gfp.h | 4 ++-- 7 files changed, 6 insertions(+), 27 deletions(-) diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 09fc385c2acd..3639e0a7cb3b 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -3,9 +3,8 @@ * License. 
See the file "COPYING" in the main directory of this archive * for more details. * - * This file contains NUMA specific variables and functions which can - * be split away from DISCONTIGMEM and are used on NUMA machines with - * contiguous memory. + * This file contains NUMA specific variables and functions which are used on + * NUMA machines with contiguous memory. * 2002/08/07 Erich Focht * Populate cpu entries in sysfs for non-numa systems as well * Intel Corporation - Ashok Raj diff --git a/arch/ia64/mm/numa.c b/arch/ia64/mm/numa.c index 46b6e5f3a40f..d6579ec3ea32 100644 --- a/arch/ia64/mm/numa.c +++ b/arch/ia64/mm/numa.c @@ -3,9 +3,8 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * This file contains NUMA specific variables and functions which can - * be split away from DISCONTIGMEM and are used on NUMA machines with - * contiguous memory. + * This file contains NUMA specific variables and functions which are used on + * NUMA machines with contiguous memory. * * 2002/08/07 Erich Focht */ diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index b826b8473e95..7649ab45e80c 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -20,10 +20,4 @@ #define nid_to_addrbase(nid) 0 #endif -#ifdef CONFIG_DISCONTIGMEM - -#define pfn_to_nid(pfn) pa_to_nid((pfn) << PAGE_SHIFT) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index c36358758969..97f6ca341448 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -454,9 +454,6 @@ void __init mem_init(void) BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (_PFN_SHIFT > PAGE_SHIFT)); #ifdef CONFIG_HIGHMEM -#ifdef CONFIG_DISCONTIGMEM -#error "CONFIG_HIGHMEM and CONFIG_DISCONTIGMEM dont work together yet" -#endif max_mapnr = highend_pfn ? highend_pfn : max_low_pfn; #else max_mapnr = max_low_pfn; diff --git a/arch/nds32/include/asm/memory.h b/arch/nds32/include/asm/memory.h index 940d32842793..62faafbc28e4 100644 --- a/arch/nds32/include/asm/memory.h +++ b/arch/nds32/include/asm/memory.h @@ -76,18 +76,12 @@ * virt_to_page(k) convert a _valid_ virtual address to struct page * * virt_addr_valid(k) indicates whether a virtual address is valid */ -#ifndef CONFIG_DISCONTIGMEM - #define ARCH_PFN_OFFSET PHYS_PFN_OFFSET #define pfn_valid(pfn) ((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr)) #define virt_to_page(kaddr) (pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)) #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) -#else /* CONFIG_DISCONTIGMEM */ -#error CONFIG_DISCONTIGMEM is not supported yet. 
-#endif /* !CONFIG_DISCONTIGMEM */ - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #endif diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 37ce25ef92d6..493eb7083b1a 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -192,10 +192,6 @@ static inline unsigned long ___pa(unsigned long va) #define pfn_valid(pfn) \ ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) -#ifdef CONFIG_DISCONTIGMEM -# error CONFIG_DISCONTIGMEM not supported -#endif - #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 94f0b8b1cb55..0bec15b0691f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -494,8 +494,8 @@ static inline int gfp_zonelist(gfp_t flags) * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * - * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets - * optimized to &contig_page_data at compile-time. + * For the case of non-NUMA systems the NODE_DATA() gets optimized to + * &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { -- cgit From 48d9f3355a8eaa79b00472929b517df497fc6d5f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:42:58 -0700 Subject: docs: remove description of DISCONTIGMEM Remove description of DISCONTIGMEM from the "Memory Models" document and update VM sysctl description so that it won't mention DISCONIGMEM. Link: https://lkml.kernel.org/r/20210608091316.3622-8-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Reviewed-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 12 ++++----- Documentation/vm/memory-model.rst | 45 ++------------------------------- 2 files changed, 8 insertions(+), 49 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 2da25735a629..8387ad0b0b83 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -938,12 +938,12 @@ allocations, THP and hugetlbfs pages. To make it sensible with respect to the watermark_scale_factor parameter, the unit is in fractions of 10,000. The default value of -15,000 on !DISCONTIGMEM configurations means that up to 150% of the high -watermark will be reclaimed in the event of a pageblock being mixed due -to fragmentation. The level of reclaim is determined by the number of -fragmentation events that occurred in the recent past. If this value is -smaller than a pageblock then a pageblocks worth of pages will be reclaimed -(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature. +15,000 means that up to 150% of the high watermark will be reclaimed in the +event of a pageblock being mixed due to fragmentation. The level of reclaim +is determined by the number of fragmentation events that occurred in the +recent past. If this value is smaller than a pageblock then a pageblocks +worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor +of 0 will disable the feature. 
watermark_scale_factor diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index ce398a7dc6cd..30e8fbed6914 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -14,15 +14,11 @@ for the CPU. Then there could be several contiguous ranges at completely distinct addresses. And, don't forget about NUMA, where different memory banks are attached to different CPUs. -Linux abstracts this diversity using one of the three memory models: -FLATMEM, DISCONTIGMEM and SPARSEMEM. Each architecture defines what +Linux abstracts this diversity using one of the two memory models: +FLATMEM and SPARSEMEM. Each architecture defines what memory models it supports, what the default memory model is and whether it is possible to manually override that default. -.. note:: - At time of this writing, DISCONTIGMEM is considered deprecated, - although it is still in use by several architectures. - All the memory models track the status of physical page frames using struct page arranged in one or more arrays. @@ -63,43 +59,6 @@ straightforward: `PFN - ARCH_PFN_OFFSET` is an index to the The `ARCH_PFN_OFFSET` defines the first page frame number for systems with physical memory starting at address different from 0. -DISCONTIGMEM -============ - -The DISCONTIGMEM model treats the physical memory as a collection of -`nodes` similarly to how Linux NUMA support does. For each node Linux -constructs an independent memory management subsystem represented by -`struct pglist_data` (or `pg_data_t` for short). Among other -things, `pg_data_t` holds the `node_mem_map` array that maps -physical pages belonging to that node. The `node_start_pfn` field of -`pg_data_t` is the number of the first page frame belonging to that -node. - -The architecture setup code should call :c:func:`free_area_init_node` for -each node in the system to initialize the `pg_data_t` object and its -`node_mem_map`. - -Every `node_mem_map` behaves exactly as FLATMEM's `mem_map` - -every physical page frame in a node has a `struct page` entry in the -`node_mem_map` array. When DISCONTIGMEM is enabled, a portion of the -`flags` field of the `struct page` encodes the node number of the -node hosting that page. - -The conversion between a PFN and the `struct page` in the -DISCONTIGMEM model became slightly more complex as it has to determine -which node hosts the physical page and which `pg_data_t` object -holds the `struct page`. - -Architectures that support DISCONTIGMEM provide :c:func:`pfn_to_nid` -to convert PFN to the node number. The opposite conversion helper -:c:func:`page_to_nid` is generic as it uses the node number encoded in -page->flags. - -Once the node number is known, the PFN can be used to index -appropriate `node_mem_map` array to access the `struct page` and -the offset of the `struct page` from the `node_mem_map` plus -`node_start_pfn` is the PFN of that page. - SPARSEMEM ========= -- cgit From a9ee6cf5c60ed1070e786e53665f9b2f23f2bd11 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:01 -0700 Subject: mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA After removal of DISCINTIGMEM the NEED_MULTIPLE_NODES and NUMA configuration options are equivalent. Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead. Done with $ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \ $(git grep -wl CONFIG_NEED_MULTIPLE_NODES) $ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \ $(git grep -wl NEED_MULTIPLE_NODES) with manual tweaks afterwards. 
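For illustration, a minimal user-space sketch of the NODE_DATA() selection that now keys directly off CONFIG_NUMA; the struct layout and array size are demo assumptions, while the contig_page_data fallback mirrors the include/linux/mmzone.h hunk below:

#include <stdio.h>

struct pglist_data { int node_id; };

#ifdef CONFIG_NUMA
/* NUMA: one pglist_data per node, normally provided by <asm/mmzone.h>. */
static struct pglist_data node_data[4] = { {0}, {1}, {2}, {3} };
#define NODE_DATA(nid)  (&node_data[nid])
#else
/* !NUMA: every nid resolves to the single contig_page_data. */
static struct pglist_data contig_page_data = { 0 };
#define NODE_DATA(nid)  (&contig_page_data)
#endif

int main(void)
{
        for (int nid = 0; nid < 2; nid++)
                printf("NODE_DATA(%d)->node_id = %d\n",
                       nid, NODE_DATA(nid)->node_id);
        return 0;
}

Building it once plainly and once with -DCONFIG_NUMA shows the two resolutions that used to be distinguished by NEED_MULTIPLE_NODES.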
[rppt@linux.ibm.com: fix arm boot crash] Link: https://lkml.kernel.org/r/YMj9vHhHOiCVN4BF@linux.ibm.com Link: https://lkml.kernel.org/r/20210608091316.3622-9-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/ia64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/mips/include/asm/mmzone.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/mips/mm/init.c | 4 ++-- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/mmzone.h | 4 ++-- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kexec/core.c | 4 ++-- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/include/asm/mmzone.h | 4 ++-- arch/sh/kernel/topology.c | 2 +- arch/sh/mm/Kconfig | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/Kconfig | 2 +- arch/sparc/include/asm/mmzone.h | 4 ++-- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/mm/init_64.c | 12 ++++++------ arch/x86/Kconfig | 2 +- arch/x86/kernel/setup_percpu.c | 6 +++--- arch/x86/mm/init_32.c | 4 ++-- include/asm-generic/topology.h | 2 +- include/linux/memblock.h | 6 +++--- include/linux/mm.h | 4 ++-- include/linux/mmzone.h | 6 +++--- kernel/crash_core.c | 2 +- mm/Kconfig | 9 --------- mm/memblock.c | 8 ++++---- mm/memory.c | 3 +-- mm/page_alloc.c | 6 +++--- mm/sparse.c | 2 +- 36 files changed, 59 insertions(+), 69 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..d01a1545ab8f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1035,7 +1035,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 279252e3e0f7..da22a35e6f03 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -302,7 +302,7 @@ config NODES_SHIFT int "Max num nodes shift(3-10)" range 3 10 default "10" - depends on NEED_MULTIPLE_NODES + depends on NUMA help This option specifies the maximum number of nodes in your SSI system. MAX_NUMNODES will be 2^(This value). 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ed51970c08e7..4704a16c2e44 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2867,7 +2867,7 @@ config RANDOMIZE_BASE_MAX_OFFSET config NODES_SHIFT int default "6" - depends on NEED_MULTIPLE_NODES + depends on NUMA config HW_PERF_EVENTS bool "Enable hardware performance counter support for perf events" diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 7649ab45e80c..602a21aee9d4 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA # include #endif diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 195ff4e9771f..96bc798c1ec1 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -239,7 +239,7 @@ static inline int pfn_valid(unsigned long pfn) /* pfn_valid is defined in linux/mmzone.h */ -#elif defined(CONFIG_NEED_MULTIPLE_NODES) +#elif defined(CONFIG_NUMA) #define pfn_valid(pfn) \ ({ \ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 97f6ca341448..19347dc6bbf8 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -394,7 +394,7 @@ void maar_init(void) } } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -473,7 +473,7 @@ void __init mem_init(void) 0x80000000 - 4, KCORE_TEXT); #endif } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) { diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 088dd2afcfe4..14b132cf95e2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -671,7 +671,7 @@ config NODES_SHIFT int default "8" if PPC64 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA config USE_PERCPU_NUMA_NODE_ID def_bool y diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 6cda76b57c5d..4c6c6dbd182f 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -18,7 +18,7 @@ * flags field of the struct page */ -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA extern struct pglist_data *node_data[]; /* @@ -41,7 +41,7 @@ u64 memory_hotplug_max(void); #else #define memory_hotplug_max() memblock_end_of_DRAM() -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #ifdef CONFIG_FA_DUMP #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e42b85e4f1aa..a35fbf4d0bce 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -788,7 +788,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2e05c783440a..a5209ea3859e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1047,7 +1047,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * numa_node_id() works after this. 
*/ diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 56da5eb2b923..48525e8b5730 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -68,11 +68,11 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index c3df3a8501d4..2ffcf540f08b 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,7 +13,7 @@ obj-y := fault.o mem.o pgtable.o mmap.o maccess.o \ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ -obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index a6b36a40897a..c5e520c6f13b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -127,7 +127,7 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, } #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init mem_topology_setup(void) { max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; @@ -162,7 +162,7 @@ static int __init mark_nonram_nosave(void) return 0; } -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ static int __init mark_nonram_nosave(void) { return 0; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 18ec0f9bb8d5..15f9490a7aad 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -332,7 +332,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "2" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b4c7c34069f8..707afbcd81c2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -475,7 +475,7 @@ config NUMA config NODES_SHIFT int - depends on NEED_MULTIPLE_NODES + depends on NUMA default "1" config SCHED_SMT diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h index 6552a088dc97..7b8dead2723d 100644 --- a/arch/sh/include/asm/mmzone.h +++ b/arch/sh/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_MMZONE_H #define __ASM_SH_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include extern struct pglist_data *node_data[]; @@ -31,7 +31,7 @@ static inline void setup_bootmem_node(int nid, unsigned long start, unsigned long end) { } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Platform specific mem init */ void __init plat_mem_setup(void); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 7a989eed3b18..76af6db9daa2 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,7 +46,7 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); #endif diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index d551a9cac41e..ba569cfb4368 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -120,7 +120,7 @@ config NODES_SHIFT int default "3" if CPU_SUBTYPE_SHX3 default "1" - depends on NEED_MULTIPLE_NODES + depends on NUMA config ARCH_FLATMEM_ENABLE def_bool y diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 168d7d4dd735..ce26c7f8950a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -211,7 +211,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_try_nid( sizeof(struct pglist_data), SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 164a5254c91c..c72f52c704cd 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -265,7 +265,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 4 5 if SPARC64 default "5" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. 
diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index 6543fb97a849..a236d8aa893a 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef _SPARC64_MMZONE_H #define _SPARC64_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include @@ -13,6 +13,6 @@ extern struct pglist_data *node_data[]; extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #endif /* _SPARC64_MMZONE_H */ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e38d8bf454e8..c89a5971fb0d 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1546,7 +1546,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = cpu_to_node(cpu); void *ptr; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index e454f179cf5d..06e938d03f3b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -903,7 +903,7 @@ struct node_mem_mask { static struct node_mem_mask node_masks[MAX_NUMNODES]; static int num_node_masks; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct mdesc_mlgroup { u64 node; @@ -1059,7 +1059,7 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); @@ -1080,7 +1080,7 @@ static void __init allocate_node_data(int nid) static void init_node_masks_nonnuma(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; #endif @@ -1090,7 +1090,7 @@ static void init_node_masks_nonnuma(void) node_masks[0].match = 0; num_node_masks = 1; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for (i = 0; i < NR_CPUS; i++) numa_cpu_lookup_table[i] = 0; @@ -1098,7 +1098,7 @@ static void init_node_masks_nonnuma(void) #endif } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); @@ -2487,7 +2487,7 @@ int page_in_phys_avail(unsigned long paddr) static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; for_each_online_node(i) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..5d523ff70fe7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1597,7 +1597,7 @@ config NODES_SHIFT default "10" if MAXSMP default "6" if X86_64 default "3" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0941d2f44f2a..78a32b956e81 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ static bool __init pcpu_need_numa(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA pg_data_t *last = NULL; unsigned int cpu; @@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; @@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 21ffb03f6c72..74b78840182d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -651,7 +651,7 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -677,7 +677,7 @@ void __init initmem_init(void) setup_bootmem_allocator(); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void __init setup_bootmem_allocator(void) { diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 5aa8705df87e..4dbe715be65b 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -45,7 +45,7 @@ #endif #ifndef cpumask_of_node - #ifdef CONFIG_NEED_MULTIPLE_NODES + #ifdef CONFIG_NUMA #define cpumask_of_node(node) ((node) == 0 ? 
cpu_online_mask : cpu_none_mask) #else #define cpumask_of_node(node) ((void)(node), cpu_online_mask) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5984fff3f175..552309342c38 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int nid; #endif }; @@ -347,7 +347,7 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -366,7 +366,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bd21e6fad6a..07922ee1477e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,7 +46,7 @@ extern int sysctl_page_lock_unfairness; void init_mm_internals(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ +#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) @@ -2460,7 +2460,7 @@ extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f9829562af2..4bd420ed3961 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1043,17 +1043,17 @@ extern int percpu_pagelist_high_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ #include -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 684a6061a13a..0a4780c047c9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 218b96ccc84a..bffe4bd859f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -59,15 +59,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -# -# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's -# to represent different areas of memory. This variable allows -# those dependencies to exist individually. -# -config NEED_MULTIPLE_NODES - def_bool y - depends on NUMA - # # SPARSEMEM_EXTREME (which is the default) does some bootmem # allocations when sparse_init() is called. 
If this cannot diff --git a/mm/memblock.c b/mm/memblock.c index afaefa8fc6ab..123feef5259d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -92,7 +92,7 @@ * system initialization completes. */ -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); #endif @@ -607,7 +607,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int start_rgn, end_rgn; int i, ret; @@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memory.c b/mm/memory.c index 3dd6b2e73e1d..48c4576df898 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -90,8 +90,7 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES -/* use the per-pgdat data instead for discontigmem - mbligh */ +#ifndef CONFIG_NUMA unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8926f3fd3bcf..c4069f9e3968 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1634,7 +1634,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * During memory init memblocks map pfns to nids. The search is expensive and @@ -1684,7 +1684,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -7438,7 +7438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA /* * With no DISCONTIG, the global mem_map is just set as node 0's */ diff --git a/mm/sparse.c b/mm/sparse.c index 55c18aff3e42..7272f7a1449d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -346,7 +346,7 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA return __pa_symbol(pgdat); #else return __pa(pgdat); -- cgit From 43b02ba93b25b1caff7a3457fc5d005485e78da5 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:05 -0700 Subject: mm: replace CONFIG_FLAT_NODE_MEM_MAP with CONFIG_FLATMEM After removal of the DISCONTIGMEM memory model the FLAT_NODE_MEM_MAP configuration option is equivalent to FLATMEM. Drop CONFIG_FLAT_NODE_MEM_MAP and use CONFIG_FLATMEM instead. 
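For reference, a small stand-alone sketch (plain user-space C, not kernel code) of the flat node_mem_map indexing that the renamed option gates; the map size and start pfn are arbitrary demo values, while pgdat_page_nr() follows the FLATMEM definition in the mmzone.h hunk below:

#include <stdio.h>

struct page { unsigned long flags; };

struct pglist_data {
        struct page *node_mem_map;      /* only present with FLATMEM */
        unsigned long node_start_pfn;
};

/* FLATMEM variant: the node's pages form one contiguous array. */
#define pgdat_page_nr(pgdat, pagenr)  ((pgdat)->node_mem_map + (pagenr))

int main(void)
{
        static struct page map[32];             /* toy mem_map */
        struct pglist_data pgdat = {
                .node_mem_map = map,
                .node_start_pfn = 0x100,        /* arbitrary demo value */
        };
        struct page *p = pgdat_page_nr(&pgdat, 5);

        printf("page #5 of the node is at map index %td, i.e. pfn 0x%lx\n",
               p - pgdat.node_mem_map,
               pgdat.node_start_pfn + (p - pgdat.node_mem_map));
        return 0;
}

The SPARSEMEM case has no node_mem_map and goes through pfn_to_page() instead, which is why the field and this accessor stay under the #ifdef.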
Link: https://lkml.kernel.org/r/20210608091316.3622-10-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- kernel/crash_core.c | 2 +- mm/Kconfig | 4 ---- mm/page_alloc.c | 6 +++--- mm/page_ext.c | 2 +- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4bd420ed3961..578588d4afc9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -788,7 +788,7 @@ typedef struct pglist_data { struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ -#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ +#ifdef CONFIG_FLATMEM /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; @@ -878,7 +878,7 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) #else #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 0a4780c047c9..da449c1cdca7 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -484,7 +484,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM VMCOREINFO_OFFSET(pglist_data, node_mem_map); #endif VMCOREINFO_OFFSET(pglist_data, node_start_pfn); diff --git a/mm/Kconfig b/mm/Kconfig index bffe4bd859f3..ded98fb859ab 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -55,10 +55,6 @@ config FLATMEM def_bool y depends on !SPARSEMEM || FLATMEM_MANUAL -config FLAT_NODE_MEM_MAP - def_bool y - depends on !SPARSEMEM - # # SPARSEMEM_EXTREME (which is the default) does some bootmem # allocations when sparse_init() is called. 
If this cannot diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4069f9e3968..0e441f1677f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6547,7 +6547,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLATMEM) /* * Only struct pages that correspond to ranges defined by memblock.memory * are zeroed and initialized by going through __init_single_page() during @@ -7403,7 +7403,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) } } -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLATMEM static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -7451,7 +7451,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) } #else static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* CONFIG_FLATMEM */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) diff --git a/mm/page_ext.c b/mm/page_ext.c index df6f74aac8e1..293b2685fc48 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -191,7 +191,7 @@ fail: panic("Out of memory"); } -#else /* CONFIG_FLAT_NODE_MEM_MAP */ +#else /* CONFIG_FLATMEM */ struct page_ext *lookup_page_ext(const struct page *page) { -- cgit From 44042b4498728f4376e84bae1ac8016d146d850b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:08 -0700 Subject: mm/page_alloc: allow high-order pages to be stored on the per-cpu lists The per-cpu page allocator (PCP) only stores order-0 pages. This means that all THP and "cheap" high-order allocations including SLUB contends on the zone->lock. This patch extends the PCP allocator to store THP and "cheap" high-order pages. Note that struct per_cpu_pages increases in size to 256 bytes (4 cache lines) on x86-64. Note that this is not necessarily a universal performance win because of how it is implemented. High-order pages can cause pcp->high to be exceeded prematurely for lower-orders so for example, a large number of THP pages being freed could release order-0 pages from the PCP lists. Hence, much depends on the allocation/free pattern as observed by a single CPU to determine if caching helps or hurts a particular workload. That said, basic performance testing passed. The following is a netperf UDP_STREAM test which hits the relevant patches as some of the network allocations are high-order. netperf-udp 5.13.0-rc2 5.13.0-rc2 mm-pcpburst-v3r4 mm-pcphighorder-v1r7 Hmean send-64 261.46 ( 0.00%) 266.30 * 1.85%* Hmean send-128 516.35 ( 0.00%) 536.78 * 3.96%* Hmean send-256 1014.13 ( 0.00%) 1034.63 * 2.02%* Hmean send-1024 3907.65 ( 0.00%) 4046.11 * 3.54%* Hmean send-2048 7492.93 ( 0.00%) 7754.85 * 3.50%* Hmean send-3312 11410.04 ( 0.00%) 11772.32 * 3.18%* Hmean send-4096 13521.95 ( 0.00%) 13912.34 * 2.89%* Hmean send-8192 21660.50 ( 0.00%) 22730.72 * 4.94%* Hmean send-16384 31902.32 ( 0.00%) 32637.50 * 2.30%* Functionally, a patch like this is necessary to make bulk allocation of high-order pages work with similar performance to order-0 bulk allocations. The bulk allocator is not updated in this series as it would have to be determined by bulk allocation users how they want to track the order of pages allocated with the bulk allocator. 
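To make the list layout easier to follow when reading the diff, here is a stand-alone user-space sketch of the order <-> pindex mapping introduced below; the MIGRATE_PCPTYPES, PAGE_ALLOC_COSTLY_ORDER and pageblock order values are typical x86-64 assumptions for the demo, and the two functions are simplified copies of order_to_pindex()/pindex_to_order() with the CONFIG guards dropped:

#include <stdio.h>

#define MIGRATE_PCPTYPES        3  /* unmovable, movable, reclaimable */
#define PAGE_ALLOC_COSTLY_ORDER 3
#define PAGEBLOCK_ORDER         9  /* THP/pageblock order, x86-64 with 4K pages */
#define NR_PCP_THP              1  /* CONFIG_TRANSPARENT_HUGEPAGE=y */
#define NR_PCP_LISTS    (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))

/* Orders up to PAGE_ALLOC_COSTLY_ORDER get their own per-migratetype list;
 * everything above collapses onto one extra slot reserved for THP. */
static unsigned int order_to_pindex(int migratetype, int order)
{
        int base = order;

        if (order > PAGE_ALLOC_COSTLY_ORDER)
                base = PAGE_ALLOC_COSTLY_ORDER + 1;

        return (MIGRATE_PCPTYPES * base) + migratetype;
}

static int pindex_to_order(unsigned int pindex)
{
        int order = pindex / MIGRATE_PCPTYPES;

        if (order > PAGE_ALLOC_COSTLY_ORDER)
                order = PAGEBLOCK_ORDER;

        return order;
}

int main(void)
{
        int orders[] = { 0, 1, 2, 3, PAGEBLOCK_ORDER };
        int n = sizeof(orders) / sizeof(orders[0]);

        printf("NR_PCP_LISTS = %d\n", NR_PCP_LISTS);
        for (int i = 0; i < n; i++) {
                unsigned int pindex = order_to_pindex(1 /* movable */, orders[i]);

                printf("order %d -> pindex %2u -> order %d\n",
                       orders[i], pindex, pindex_to_order(pindex));
        }
        return 0;
}

With these assumptions the per-cpu structure carries 15 lists and the mapping round-trips for every cached order, which is what lets free_pcppages_bulk() recover the order from the list index it is draining.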
Link: https://lkml.kernel.org/r/20210611135753.GC30378@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Zi Yan Cc: Dave Hansen Cc: Michal Hocko Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 20 +++++- mm/internal.h | 2 +- mm/page_alloc.c | 169 +++++++++++++++++++++++++++++++++++-------------- mm/swap.c | 2 +- 4 files changed, 144 insertions(+), 49 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578588d4afc9..265a32e1ff74 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -333,6 +333,24 @@ enum zone_watermarks { NR_WMARK }; +/* + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional + * for pageblock size for THP if configured. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define NR_PCP_THP 1 +#else +#define NR_PCP_THP 0 +#endif +#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) + +/* + * Shift to encode migratetype and order in the same integer, with order + * in the least significant bits. + */ +#define NR_PCP_ORDER_WIDTH 8 +#define NR_PCP_ORDER_MASK ((1<_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) @@ -349,7 +367,7 @@ struct per_cpu_pages { #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ - struct list_head lists[MIGRATE_PCPTYPES]; + struct list_head lists[NR_PCP_LISTS]; }; struct per_cpu_zonestat { diff --git a/mm/internal.h b/mm/internal.h index 18e5fb4d225f..6ec2cea9926b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -203,7 +203,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; -extern void free_unref_page(struct page *page); +extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); extern void zone_pcp_update(struct zone *zone, int cpu_online); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e441f1677f3..34f097ecfe08 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -674,10 +674,53 @@ out: add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + base = PAGE_ALLOC_COSTLY_ORDER + 1; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + order = pageblock_order; + VM_BUG_ON(order != pageblock_order); + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + static inline void free_the_page(struct page *page, unsigned int order) { - if (order == 0) /* Via pcp? */ - free_unref_page(page); + if (pcp_allowed_order(order)) /* Via pcp? 
*/ + free_unref_page(page, order); else __free_pages_ok(page, order, FPI_NONE); } @@ -700,7 +743,7 @@ static inline void free_the_page(struct page *page, unsigned int order) void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page), FPI_NONE); + free_the_page(page, compound_order(page)); } void prep_compound_page(struct page *page, unsigned int order) @@ -1350,9 +1393,9 @@ static __always_inline bool free_pages_prepare(struct page *page, * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when * moved from pcp lists to free lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1369,12 +1412,12 @@ static bool bulkfree_pcp_prepare(struct page *page) * debug_pagealloc enabled, they are checked also immediately when being freed * to the pcp lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true, FPI_NONE); + return free_pages_prepare(page, order, true, FPI_NONE); else - return free_pages_prepare(page, 0, false, FPI_NONE); + return free_pages_prepare(page, order, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1406,8 +1449,10 @@ static inline void prefetch_buddy(struct page *page) static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { - int migratetype = 0; + int pindex = 0; int batch_free = 0; + int nr_freed = 0; + unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; @@ -1418,7 +1463,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * below while (list_empty(list)) loop. */ count = min(pcp->count, count); - while (count) { + while (count > 0) { struct list_head *list; /* @@ -1430,24 +1475,31 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ do { batch_free++; - if (++migratetype == MIGRATE_PCPTYPES) - migratetype = 0; - list = &pcp->lists[migratetype]; + if (++pindex == NR_PCP_LISTS) + pindex = 0; + list = &pcp->lists[pindex]; } while (list_empty(list)); /* This is the only non-empty list. Free them all. 
*/ - if (batch_free == MIGRATE_PCPTYPES) + if (batch_free == NR_PCP_LISTS) batch_free = count; + order = pindex_to_order(pindex); + BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { page = list_last_entry(list, struct page, lru); /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - pcp->count--; + nr_freed += 1 << order; + count -= 1 << order; if (bulkfree_pcp_prepare(page)) continue; + /* Encode order with the migratetype */ + page->index <<= NR_PCP_ORDER_WIDTH; + page->index |= order; + list_add_tail(&page->lru, &head); /* @@ -1463,8 +1515,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page); prefetch_nr--; } - } while (--count && --batch_free && !list_empty(list)); + } while (count > 0 && --batch_free && !list_empty(list)); } + pcp->count -= nr_freed; /* * local_lock_irq held so equivalent to spin_lock_irqsave for @@ -1479,14 +1532,19 @@ */ list_for_each_entry_safe(page, tmp, &head, lru) { int mt = get_pcppage_migratetype(page); + + /* mt has been encoded with the order (see above) */ + order = mt & NR_PCP_ORDER_MASK; + mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); } spin_unlock(&zone->lock); } @@ -3263,11 +3321,12 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_unref_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) { int migratetype; - if (!free_pcp_prepare(page)) + if (!free_pcp_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); @@ -3317,16 +3376,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) } static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype) + int migratetype, unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int high; + int pindex; __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); - list_add(&page->lru, &pcp->lists[migratetype]); - pcp->count++; + pindex = order_to_pindex(migratetype, order); + list_add(&page->lru, &pcp->lists[pindex]); + pcp->count += 1 << order; high = nr_pcp_high(pcp, zone); if (pcp->count >= high) { int batch = READ_ONCE(pcp->batch); @@ -3336,15 +3397,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, } /* - * Free a 0-order page + * Free a pcp page */ -void free_unref_page(struct page *page) +void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, order)) return; /* @@ -3357,14 +3418,14 @@ void free_unref_page(struct page *page) migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype); +
free_unref_page_commit(page, pfn, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3381,7 +3442,7 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, 0)) list_del(&page->lru); /* @@ -3413,7 +3474,7 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, 0); migratetype = get_pcppage_migratetype(page); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype); + free_unref_page_commit(page, pfn, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get @@ -3549,7 +3610,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, /* Remove page from the per-cpu list, caller must protect the list */ static inline -struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3558,16 +3620,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - READ_ONCE(pcp->batch), list, + int batch = READ_ONCE(pcp->batch); + int alloced; + + /* + * Scale batch relative to order if batch implies + * free pages can be stored on the PCP. Batch can + * be 1 for small zones or for boot pagesets which + * should never store free pages as the pages may + * belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + alloced = rmqueue_bulk(zone, order, + batch, list, migratetype, alloc_flags); + + pcp->count += alloced << order; if (unlikely(list_empty(list))) return NULL; } page = list_first_entry(list, struct page, lru); list_del(&page->lru); - pcp->count--; + pcp->count -= 1 << order; } while (check_new_pcp(page)); return page; @@ -3575,8 +3651,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, gfp_t gfp_flags, - int migratetype, unsigned int alloc_flags) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3592,8 +3669,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, */ pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; - list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3614,15 +3691,15 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0)) { + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and * we need to skip it when CMA area isn't allowed. 
*/ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, - migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype, alloc_flags); goto out; } } @@ -5201,7 +5278,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_lock_irqsave(&pagesets.lock, flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[ac.migratetype]; + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { @@ -5211,7 +5288,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, continue; } - page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { /* Try and get at least one page */ @@ -6778,13 +6855,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) { - int migratetype; + int pindex; memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); - for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) - INIT_LIST_HEAD(&pcp->lists[migratetype]); + for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) + INIT_LIST_HEAD(&pcp->lists[pindex]); /* * Set batch and high values safe for a boot pageset. A true percpu diff --git a/mm/swap.c b/mm/swap.c index 18cc9e63515b..6c11db780467 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -95,7 +95,7 @@ static void __put_single_page(struct page *page) { __page_cache_release(page); mem_cgroup_uncharge(page); - free_unref_page(page); + free_unref_page(page, 0); } static void __put_compound_page(struct page *page) -- cgit From 203c06eef579c670b8eb3a24108b9837bf9b7737 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 28 Jun 2021 19:43:11 -0700 Subject: mm/page_alloc: split pcp->high across all online CPUs for cpuless nodes Dave Hansen reported the following about Feng Tang's tests on a machine with persistent memory onlined as a DRAM-like device. Feng Tang tossed these on a "Cascade Lake" system with 96 threads and ~512G of persistent memory and 128G of DRAM. The PMEM is in "volatile use" mode and being managed via the buddy just like the normal RAM. The PMEM zones are big ones: present 65011712 = 248 G high 134595 = 525 M The PMEM nodes, of course, don't have any CPUs in them. With your series, the pcp->high value per-cpu is 69584 pages or about 270MB per CPU. Scaled up by the 96 CPU threads, that's ~26GB of worst-case memory in the pcps per zone, or roughly 10% of the size of the zone. This should not cause a problem as such although it could trigger reclaim due to pages being stored on per-cpu lists for CPUs remote to a node. It is not possible to treat cpuless nodes exactly the same as normal nodes but the worst-case scenario can be mitigated by splitting pcp->high across all online CPUs for cpuless memory nodes. 
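A minimal sketch of the new divisor logic follows, assuming the earlier part of zone_highsize() has already produced a per-zone high budget in pages; total_high_pages below stands in for that value and the CPU counts are illustrative. With the previous code the divisor for a CPU-less node degenerated to 1, so the entire budget landed on every CPU's pcp list; the fallback to the number of online CPUs is what bounds the worst case described above.

#include <stdio.h>

/*
 * Illustrative only: mirrors the divisor change in zone_highsize().
 * cpus_on_node is the weight of the node's cpumask and cpu_coming_online
 * is the extra CPU counted during hotplug; when both are zero (a
 * memory-only node such as volatile-use PMEM), fall back to splitting
 * across every online CPU.
 */
static unsigned long split_high(unsigned long total_high_pages,
                                unsigned int cpus_on_node,
                                unsigned int cpu_coming_online,
                                unsigned int online_cpus)
{
        unsigned int nr_split_cpus = cpus_on_node + cpu_coming_online;

        if (!nr_split_cpus)
                nr_split_cpus = online_cpus;
        return total_high_pages / nr_split_cpus;
}

int main(void)
{
        unsigned long total = 69584;    /* example budget: ~270MB of 4K pages */

        printf("node with 24 local CPUs:       %lu pages per CPU\n",
               split_high(total, 24, 0, 96));
        printf("CPU-less node, 96 CPUs online: %lu pages per CPU\n",
               split_high(total, 0, 0, 96));
        return 0;
}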
Link: https://lkml.kernel.org/r/20210616110743.GK30378@techsingularity.net Suggested-by: Dave Hansen Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Dave Hansen Cc: Hillf Danton Cc: Michal Hocko Cc: "Tang, Feng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34f097ecfe08..db00ee8d79d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6790,7 +6790,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) { #ifdef CONFIG_MMU int high; - int nr_local_cpus; + int nr_split_cpus; unsigned long total_pages; if (!percpu_pagelist_high_fraction) { @@ -6813,10 +6813,14 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * Split the high value across all online CPUs local to the zone. Note * that early in boot that CPUs may not be online yet and that during * CPU hotplug that the cpumask is not yet updated when a CPU is being - * onlined. - */ - nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online; - high = total_pages / nr_local_cpus; + * onlined. For memory nodes that have no CPUs, split pcp->high across + * all online CPUs to mitigate the risk that reclaim is triggered + * prematurely due to pages stored on pcp lists. + */ + nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; + if (!nr_split_cpus) + nr_split_cpus = num_online_cpus(); + high = total_pages / nr_split_cpus; /* * Ensure high is at least batch*4. The multiple is based on the -- cgit From a3f5d80ea401ac857f2910e28b15f35b2cf902f4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: mm,hwpoison: send SIGBUS with error virutal address Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. 
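From userspace, the benefit is that the SIGBUS delivered on this path now carries the error virtual address in si_addr. The following is an illustrative sketch of a hwpoison-aware consumer using the standard sigaction()/siginfo_t interface; it is not part of the patch, and a real application would use the reported address and granularity to drop or rebuild the affected mapping rather than just logging it.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Sketch of a hwpoison-aware SIGBUS consumer (illustration only). */
static void sigbus_handler(int sig, siginfo_t *info, void *ucontext)
{
        (void)sig;
        (void)ucontext;

        if (info->si_code == BUS_MCEERR_AR || info->si_code == BUS_MCEERR_AO)
                /*
                 * si_addr is the error virtual address; si_addr_lsb gives
                 * its granularity (page shift or huge page shift).  fprintf
                 * is not async-signal-safe and is used here only to keep
                 * the example short.
                 */
                fprintf(stderr, "memory error at %p, lsb %d\n",
                        info->si_addr, (int)info->si_addr_lsb);
        _exit(EXIT_FAILURE);
}

int main(void)
{
        struct sigaction sa = { 0 };

        sa.sa_sigaction = sigbus_handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        if (sigaction(SIGBUS, &sa, NULL))
                perror("sigaction");

        pause();        /* a machine check hitting our memory would deliver SIGBUS */
        return 0;
}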
[naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mce/core.c | 13 +++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bf7fe87a7e88..22791aadc085 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6430a94c6981..5907205c712c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -330,6 +330,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6f5f78885ab4..4d151ce3e50d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,148 @@ static void collect_procs(struct page *page, struct list_head *tokill, collect_procs_file(page, tokill, force_early); } +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = 
pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + mmap_read_unlock(p->mm); + return ret ? 
-EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1267,7 +1410,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1476,6 +1622,8 @@ try_again: pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- cgit From 0ed950d1f28142ccd9a9453c60df87853530d778 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:17 -0700 Subject: mm,hwpoison: make get_hwpoison_page() call get_any_page() __get_hwpoison_page() could fail to grab refcount by some race condition, so it's helpful if we can handle it by retrying. We already have retry logic, so make get_hwpoison_page() call get_any_page() when called from memory_failure(). As a result, get_hwpoison_page() can return negative values (i.e. error code), so some callers are also changed to handle error cases. soft_offline_page() does nothing for -EBUSY because that's enough and users in userspace can easily handle it. unpoison_memory() is also unchanged because it's broken and need thorough fixes (will be done later). Link: https://lkml.kernel.org/r/20210603233632.2964832-3-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Oscar Salvador Cc: Muchun Song Cc: Mike Kravetz Cc: Michal Hocko Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 + mm/memory-failure.c | 194 +++++++++++++++++++++++++++++----------------------- 2 files changed, 111 insertions(+), 85 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5ba5a0da6d57..103f1187043f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5938,6 +5938,8 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page) || HPageMigratable(page)) ret = get_page_unless_zero(page); + else + ret = -EBUSY; } spin_unlock_irq(&hugetlb_lock); return ret; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4d151ce3e50d..e5a1531f7f4e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1117,13 +1117,6 @@ static inline bool HWPoisonHandlable(struct page *page) return PageLRU(page) || __PageMovable(page); } -/** - * __get_hwpoison_page() - Get refcount for memory error handling: - * @page: raw error page (hit by memory error) - * - * Return: return 0 if failed to grab the refcount, otherwise true (some - * non-zero value.) - */ static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); @@ -1168,15 +1161,6 @@ static int __get_hwpoison_page(struct page *page) return 0; } -/* - * Safely get reference count of an arbitrary page. - * - * Returns 0 for a free page, 1 for an in-use page, - * -EIO for a page-type we cannot handle and -EBUSY if we raced with an - * allocation. - * We only incremented refcount in case the page was already in-use and it - * is a known type we can handle. 
- */ static int get_any_page(struct page *p, unsigned long flags) { int ret = 0, pass = 0; @@ -1186,50 +1170,77 @@ static int get_any_page(struct page *p, unsigned long flags) count_increased = true; try_again: - if (!count_increased && !__get_hwpoison_page(p)) { - if (page_count(p)) { - /* We raced with an allocation, retry. */ - if (pass++ < 3) - goto try_again; - ret = -EBUSY; - } else if (!PageHuge(p) && !is_free_buddy_page(p)) { - /* We raced with put_page, retry. */ + if (!count_increased) { + ret = __get_hwpoison_page(p); + if (!ret) { + if (page_count(p)) { + /* We raced with an allocation, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EBUSY; + } else if (!PageHuge(p) && !is_free_buddy_page(p)) { + /* We raced with put_page, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EIO; + } + goto out; + } else if (ret == -EBUSY) { + /* We raced with freeing huge page to buddy, retry. */ if (pass++ < 3) goto try_again; - ret = -EIO; + goto out; } + } + + if (PageHuge(p) || HWPoisonHandlable(p)) { + ret = 1; } else { - if (PageHuge(p) || HWPoisonHandlable(p)) { - ret = 1; - } else { - /* - * A page we cannot handle. Check whether we can turn - * it into something we can handle. - */ - if (pass++ < 3) { - put_page(p); - shake_page(p, 1); - count_increased = false; - goto try_again; - } + /* + * A page we cannot handle. Check whether we can turn + * it into something we can handle. + */ + if (pass++ < 3) { put_page(p); - ret = -EIO; + shake_page(p, 1); + count_increased = false; + goto try_again; } + put_page(p); + ret = -EIO; } - +out: return ret; } -static int get_hwpoison_page(struct page *p, unsigned long flags, - enum mf_flags ctxt) +/** + * get_hwpoison_page() - Get refcount for memory error handling + * @p: Raw error page (hit by memory error) + * @flags: Flags controlling behavior of error handling + * + * get_hwpoison_page() takes a page refcount of an error page to handle memory + * error on it, after checking that the error page is in a well-defined state + * (defined as a page-type we can successfully handle the memor error on it, + * such as LRU page and hugetlb page). + * + * Memory error handling could be triggered at any time on any type of page, + * so it's prone to race with typical memory management lifecycle (like + * allocation and free). So to avoid such races, get_hwpoison_page() takes + * extra care for the error page's state (as done in __get_hwpoison_page()), + * and has some retry logic in get_any_page(). + * + * Return: 0 on failure, + * 1 on success for in-use pages in a well-defined state, + * -EIO for pages on which we can not handle memory errors, + * -EBUSY when get_hwpoison_page() has raced with page lifecycle + * operations like allocation and free. + */ +static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - if (ctxt == MF_SOFT_OFFLINE) - ret = get_any_page(p, flags); - else - ret = __get_hwpoison_page(p); + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1418,27 +1429,33 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { - /* - * Check "filter hit" and "race with other subpage." 
- */ - lock_page(head); - if (PageHWPoison(head)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != head && TestSetPageHWPoison(head))) { - num_poisoned_pages_dec(); - unlock_page(head); - return 0; + if (!(flags & MF_COUNT_INCREASED)) { + res = get_hwpoison_page(p, flags); + if (!res) { + /* + * Check "filter hit" and "race with other subpage." + */ + lock_page(head); + if (PageHWPoison(head)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != head && TestSetPageHWPoison(head))) { + num_poisoned_pages_dec(); + unlock_page(head); + return 0; + } } + unlock_page(head); + res = MF_FAILED; + if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; + } else if (res < 0) { + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); + return -EBUSY; } - unlock_page(head); - res = MF_FAILED; - if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; } lock_page(head); @@ -1641,28 +1658,35 @@ try_again: * In fact it's dangerous to directly bump up page count from 0, * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { - if (is_free_buddy_page(p)) { - if (take_page_off_buddy(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } else { - /* We lost the race, try again */ - if (retry) { - ClearPageHWPoison(p); - num_poisoned_pages_dec(); - retry = false; - goto try_again; + if (!(flags & MF_COUNT_INCREASED)) { + res = get_hwpoison_page(p, flags); + if (!res) { + if (is_free_buddy_page(p)) { + if (take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } else { + /* We lost the race, try again */ + if (retry) { + ClearPageHWPoison(p); + num_poisoned_pages_dec(); + retry = false; + goto try_again; + } + res = MF_FAILED; } - res = MF_FAILED; + action_result(pfn, MF_MSG_BUDDY, res); + res = res == MF_RECOVERED ? 0 : -EBUSY; + } else { + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); + res = -EBUSY; } - action_result(pfn, MF_MSG_BUDDY, res); - res = res == MF_RECOVERED ? 0 : -EBUSY; - } else { - action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); + goto unlock_mutex; + } else if (res < 0) { + action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); res = -EBUSY; + goto unlock_mutex; } - goto unlock_mutex; } if (PageTransHuge(hpage)) { @@ -1940,7 +1964,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - if (!get_hwpoison_page(p, flags, 0)) { + if (!get_hwpoison_page(p, flags)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -2156,7 +2180,7 @@ int soft_offline_page(unsigned long pfn, int flags) retry: get_online_mems(); - ret = get_hwpoison_page(page, flags, MF_SOFT_OFFLINE); + ret = get_hwpoison_page(page, flags); put_online_mems(); if (ret > 0) { -- cgit