Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 459
1 file changed, 335 insertions, 124 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f19bf948db..aaa1655cf682 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,7 +72,6 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> - #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -108,6 +107,17 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. + * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -384,10 +394,15 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline void kasan_free_nondeferred_pages(struct page *page, int order) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) { - if (!static_branch_unlikely(&deferred_pages)) - kasan_free_pages(page, order); + if (static_branch_unlikely(&deferred_pages)) + return; + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -438,7 +453,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o) +static inline void kasan_free_nondeferred_pages(struct page *page, int order, + bool init, fpi_t fpi_flags) +{ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) + return; + kasan_free_pages(page, order, init); +} static inline bool early_page_uninitialised(unsigned long pfn) { @@ -764,32 +786,36 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, */ void init_mem_debugging_and_hardening(void) { + bool page_poisoning_requested = false; + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. + */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) { + static_branch_enable(&_page_poisoning_enabled); + page_poisoning_requested = true; + } +#endif + if (_init_on_alloc_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_alloc\n"); else static_branch_enable(&init_on_alloc); } if (_init_on_free_enabled_early) { - if (page_poisoning_enabled()) + if (page_poisoning_requested) pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " "will take precedence over init_on_free\n"); else static_branch_enable(&init_on_free); } -#ifdef CONFIG_PAGE_POISONING - /* - * Page poisoning is debug page alloc for some arches. 
If - * either of those options are enabled, enable poisoning. - */ - if (page_poisoning_enabled() || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())) - static_branch_enable(&_page_poisoning_enabled); -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; @@ -867,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations polluate a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. @@ -1103,7 +1129,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page_memcg(page) | + page->memcg_data | #endif (page->flags & check_flags))) return false; @@ -1128,7 +1154,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page_memcg(page))) + if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1216,9 +1242,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, bool check_free) + unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; + bool init; VM_BUG_ON_PAGE(PageTail(page), page); @@ -1276,16 +1303,21 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - if (want_init_on_free()) - kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order); /* + * As memory initialization might be integrated into KASAN, + * kasan_free_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - kasan_free_nondeferred_pages(page, order); + init = want_init_on_free(); + if (init && !kasan_has_integrated_init()) + kernel_init_free_pages(page, 1 << order); + kasan_free_nondeferred_pages(page, order, init, fpi_flags); /* * arch_free_page() can make the page's contents inaccessible. 
s390 @@ -1307,7 +1339,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ static bool free_pcp_prepare(struct page *page) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1327,9 +1359,9 @@ static bool bulkfree_pcp_prepare(struct page *page) static bool free_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, 0, true, FPI_NONE); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, 0, false, FPI_NONE); } static bool bulkfree_pcp_prepare(struct page *page) @@ -1537,7 +1569,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order, true)) + if (!free_pages_prepare(page, order, true, fpi_flags)) return; migratetype = get_pfnblock_migratetype(page, pfn); @@ -1574,7 +1606,7 @@ void __free_pages_core(struct page *page, unsigned int order) * Bypass PCP and place fresh pages right to the tail, primarily * relevant for memory onlining. */ - __free_pages_ok(page, order, FPI_TO_TAIL); + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -2292,17 +2324,32 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init; + set_page_private(page, 0); set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); - kasan_alloc_pages(page, order); + + /* + * Page unpoisoning must happen before memory initialization. + * Otherwise, the poison pattern will be overwritten for __GFP_ZERO + * allocations and the page unpoisoning code will complain. + */ kernel_unpoison_pages(page, 1 << order); - set_page_owner(page, order, gfp_flags); - if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + /* + * As memory initialization might be integrated into KASAN, + * kasan_alloc_pages and kernel_init_free_pages must be + * kept together to avoid discrepancies in behavior. + */ + init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + kasan_alloc_pages(page, order, init); + if (init && !kasan_has_integrated_init()) kernel_init_free_pages(page, 1 << order); + + set_page_owner(page, order, gfp_flags); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2386,19 +2433,21 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * boundary. 
If alignment is required, use move_freepages_block() */ static int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, + unsigned long start_pfn, unsigned long end_pfn, int migratetype, int *num_movable) { struct page *page; + unsigned long pfn; unsigned int order; int pages_moved = 0; - for (page = start_page; page <= end_page;) { - if (!pfn_valid_within(page_to_pfn(page))) { - page++; + for (pfn = start_pfn; pfn <= end_pfn;) { + if (!pfn_valid_within(pfn)) { + pfn++; continue; } + page = pfn_to_page(pfn); if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2408,8 +2457,7 @@ static int move_freepages(struct zone *zone, if (num_movable && (PageLRU(page) || __PageMovable(page))) (*num_movable)++; - - page++; + pfn++; continue; } @@ -2419,7 +2467,7 @@ static int move_freepages(struct zone *zone, order = buddy_order(page); move_to_free_list(page, zone, order, migratetype); - page += 1 << order; + pfn += 1 << order; pages_moved += 1 << order; } @@ -2429,25 +2477,22 @@ static int move_freepages(struct zone *zone, int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable) { - unsigned long start_pfn, end_pfn; - struct page *start_page, *end_page; + unsigned long start_pfn, end_pfn, pfn; if (num_movable) *num_movable = 0; - start_pfn = page_to_pfn(page); - start_pfn = start_pfn & ~(pageblock_nr_pages-1); - start_page = pfn_to_page(start_pfn); - end_page = start_page + pageblock_nr_pages - 1; + pfn = page_to_pfn(page); + start_pfn = pfn & ~(pageblock_nr_pages - 1); end_pfn = start_pfn + pageblock_nr_pages - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) - start_page = page; + start_pfn = pfn; if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype, + return move_freepages(zone, start_pfn, end_pfn, migratetype, num_movable); } @@ -2731,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock - * in this loop althoug we changed the pageblock type + * in this loop although we changed the pageblock type * from highatomic to ac->migratetype. So we should * adjust the count once. */ @@ -2908,7 +2953,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { - int i, alloced = 0; + int i, allocated = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2931,7 +2976,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages are ordered properly. */ list_add_tail(&page->lru, list); - alloced++; + allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); @@ -2940,12 +2985,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, /* * i pages were removed from the buddy list even if some leak due * to check_pcp_refill failing so adjust NR_FREE_PAGES based - * on i. Do not confuse with 'alloced' which is the number of + * on i. Do not confuse with 'allocated' which is the number of * pages added to the pcp list. 
*/ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return alloced; + return allocated; } #ifdef CONFIG_NUMA @@ -3035,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work) * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is allright but we also have to make sure to not move to + * cpu which is alright but we also have to make sure to not move to * a different one. */ preempt_disable(); @@ -3415,7 +3460,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) } /* Remove page from the per-cpu list, caller must protect the list */ -static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +static inline +struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3813,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; } -static inline unsigned int current_alloc_flags(gfp_t gfp_mask, - unsigned int alloc_flags) +/* Must be called after current_gfp_context() which can change gfp_mask */ +static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, + unsigned int alloc_flags) { #ifdef CONFIG_CMA - unsigned int pflags = current->flags; - - if (!(pflags & PF_MEMALLOC_NOCMA) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; - #endif return alloc_flags; } @@ -3922,7 +3965,7 @@ retry: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (node_reclaim_mode == 0 || + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; @@ -4130,7 +4173,7 @@ out: } /* - * Maximum number of compaction retries wit a progress before OOM + * Maximum number of compaction retries with a progress before OOM * killer is consider as the only way to move forward. 
*/ #define MAX_COMPACT_RETRIES 16 @@ -4158,6 +4201,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + if (*compact_result == COMPACT_SKIPPED) + return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall @@ -4478,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); return alloc_flags; } @@ -4780,7 +4825,7 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4921,7 +4966,7 @@ got_pg: static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask, - struct alloc_context *ac, gfp_t *alloc_mask, + struct alloc_context *ac, gfp_t *alloc_gfp, unsigned int *alloc_flags) { ac->highest_zoneidx = gfp_zone(gfp_mask); @@ -4930,7 +4975,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->migratetype = gfp_migratetype(gfp_mask); if (cpusets_enabled()) { - *alloc_mask |= __GFP_HARDWALL; + *alloc_gfp |= __GFP_HARDWALL; /* * When we are in the interrupt context, it is irrelevant * to the current task context. It means that any node ok. @@ -4949,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4966,15 +5011,160 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. + * + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. 
+ */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +{ + struct page *page; + unsigned long flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int nr_populated = 0; + + if (unlikely(nr_pages <= 0)) + return 0; + + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + nr_populated++; + + /* Use the single page allocator for one page. */ + if (nr_pages - nr_populated == 1) + goto failed; + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + return 0; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (unlikely(!zone)) + goto failed; + + /* Attempt the batch allocation */ + local_irq_save(flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + pcp_list = &pcp->lists[ac.migratetype]; + + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + + page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (unlikely(!page)) { + /* Try and get at least one page */ + if (!nr_populated) + goto failed_irq; + break; + } + + /* + * Ideally this would be batched but the best way to do + * that cheaply is to first convert zone_statistics to + * be inaccurate per-cpu counter like vm_events to avoid + * a RMW cycle then do the accounting with IRQs enabled. + */ + __count_zid_vm_events(PGALLOC, zone_idx(zone), 1); + zone_statistics(ac.preferred_zoneref->zone, zone); + + prep_new_page(page, 0, gfp, 0); + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + local_irq_restore(flags); + + return nr_populated; + +failed_irq: + local_irq_restore(flags); + +failed: + page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + return nr_populated; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + +/* * This is the 'heart' of the zoned buddy allocator. 
*/ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; /* @@ -4982,33 +5172,36 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, * so bail out early if the request is out of bound. */ if (unlikely(order >= MAX_ORDER)) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + WARN_ON_ONCE(!(gfp & __GFP_NOWARN)); return NULL; } - gfp_mask &= gfp_allowed_mask; - alloc_mask = gfp_mask; - if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) + gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. + */ + gfp = current_gfp_context(gfp); + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, + &alloc_gfp, &alloc_flags)) return NULL; /* * Forbid the first pass from falling back to types that fragment * memory until all local zones are considered. */ - alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); /* First allocation attempt */ - page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page)) goto out; - /* - * Apply scoped allocation constraints. This is mainly about GFP_NOFS - * resp. GFP_NOIO which has to be inherited for all allocation requests - * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. - */ - alloc_mask = current_gfp_context(gfp_mask); + alloc_gfp = gfp; ac.spread_dirty_pages = false; /* @@ -5017,20 +5210,20 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, */ ac.nodemask = nodemask; - page = __alloc_pages_slowpath(alloc_mask, order, &ac); + page = __alloc_pages_slowpath(alloc_gfp, order, &ac); out: - if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) { + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { __free_pages(page, order); page = NULL; } - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); return page; } -EXPORT_SYMBOL(__alloc_pages_nodemask); +EXPORT_SYMBOL(__alloc_pages); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned @@ -5736,7 +5929,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) static int __parse_numa_zonelist_order(char *s) { /* - * We used to support different zonlists modes but they turned + * We used to support different zonelists modes but they turned * out to be just not useful. 
Let's keep the warning in place * if somebody still use the cmd line parameter so that we do * not fail it silently @@ -7477,7 +7670,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } /* - * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For * such cases we allow max_zone_pfn sorted in the descending order */ bool __weak arch_has_descending_max_zone_pfns(void) @@ -7689,7 +7882,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -void __init mem_init_print_info(const char *str) +void __init mem_init_print_info(void) { unsigned long physpages, codesize, datasize, rosize, bss_size; unsigned long init_code_size, init_data_size; @@ -7728,17 +7921,17 @@ void __init mem_init_print_info(const char *str) #ifdef CONFIG_HIGHMEM ", %luK highmem" #endif - "%s%s)\n", + ")\n", nr_free_pages() << (PAGE_SHIFT - 10), physpages << (PAGE_SHIFT - 10), codesize >> 10, datasize >> 10, rosize >> 10, (init_data_size + init_code_size) >> 10, bss_size >> 10, (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10), - totalcma_pages << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10) #ifdef CONFIG_HIGHMEM - totalhigh_pages() << (PAGE_SHIFT - 10), + , totalhigh_pages() << (PAGE_SHIFT - 10) #endif - str ? ", " : "", str ? str : ""); + ); } /** @@ -8222,6 +8415,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8289,6 +8483,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8305,7 +8500,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; @@ -8450,6 +8645,27 @@ static unsigned long pfn_max_align_up(unsigned long pfn) pageblock_nr_pages)); } +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +/* Usage: See admin-guide/dynamic-debug-howto.rst */ +static void alloc_contig_dump_pages(struct list_head *page_list) +{ + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure"); + + if (DYNAMIC_DEBUG_BRANCH(descriptor)) { + struct page *page; + + dump_stack(); + list_for_each_entry(page, page_list, lru) + dump_page(page, "migration failure"); + } +} +#else +static inline void alloc_contig_dump_pages(struct list_head *page_list) +{ +} +#endif + /* [start, end) must belong to a single zone. 
*/ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long start, unsigned long end) @@ -8464,7 +8680,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - migrate_prep(); + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8474,14 +8690,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc, pfn, end); - if (!pfn) { - ret = -EINTR; + ret = isolate_migratepages_range(cc, pfn, end); + if (ret && ret != -EAGAIN) break; - } + pfn = cc->migrate_pfn; tries = 0; } else if (++tries == 5) { - ret = ret < 0 ? ret : -EBUSY; + ret = -EBUSY; break; } @@ -8491,8 +8706,18 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + + /* + * On -ENOMEM, migrate_pages() bails out right away. It is pointless + * to retry again over this error, so do the same here. + */ + if (ret == -ENOMEM) + break; } + + lru_cache_enable(); if (ret < 0) { + alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); return ret; } @@ -8503,7 +8728,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlaying pageblocks (either + * @migratetype: migratetype of the underlying pageblocks (either * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. @@ -8583,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; - ret =0; + ret = 0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES @@ -8602,8 +8827,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, * isolated thus they won't get removed from buddy. */ - lru_add_drain_all(); - order = 0; outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { @@ -8629,8 +8852,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. 
*/ if (test_pages_isolated(outer_start, end, 0)) { - pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", - __func__, outer_start, end); ret = -EBUSY; goto done; } @@ -8680,12 +8901,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; } return true; } @@ -8757,9 +8972,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8767,13 +8982,13 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); /* * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalulated. + * page high values need to be recalculated. */ void __meminit zone_pcp_update(struct zone *zone) { @@ -8805,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { - unsigned long flags; int cpu; struct per_cpu_pageset *pset; - /* avoid races with drain_pages() */ - local_irq_save(flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -8819,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE |
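
Illustrative usage sketches for the interfaces changed above (these are editor-added examples, not part of the patch).

First, the new bulk allocator. This is a minimal, hypothetical caller (demo_fill_page_array is not a function in this patch) assuming a kernel context where <linux/gfp.h> declares __alloc_pages_bulk() as added here and the caller passes an all-NULL array of order-0 page slots:

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical caller of the new bulk interface: populate an
	 * all-NULL array of order-0 page pointers in one call.  Only
	 * NULL slots are filled; the return value is the total number
	 * of populated entries in the array.
	 */
	static int demo_fill_page_array(struct page **pages, int nr)
	{
		unsigned long filled;

		filled = __alloc_pages_bulk(GFP_KERNEL, numa_mem_id(), NULL,
					    nr, NULL, pages);
		if (filled < nr)
			return -ENOMEM;	/* caller frees whatever it did get */
		return 0;
	}

The list form works the same way: pass a non-NULL page_list and a NULL page_array and up to nr_pages order-0 pages are added to the list. When the per-cpu lists cannot make progress, the function falls back to at most one regular single-page allocation, as shown in the failed: path above.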
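Second, the rename of __alloc_pages_nodemask() to __alloc_pages(), together with applying current_gfp_context() before the first allocation attempt, changes what direct callers write rather than what they get back. A minimal, hypothetical sketch (demo_get_page is not part of this patch):

	#include <linux/gfp.h>
	#include <linux/sched/mm.h>

	/*
	 * Hypothetical direct caller that already knows its preferred
	 * node.  Before this patch the call was
	 * __alloc_pages_nodemask(gfp, order, nid, NULL); now it is:
	 */
	static struct page *demo_get_page(gfp_t gfp, unsigned int order, int nid)
	{
		return __alloc_pages(gfp, order, nid, NULL);
	}

Because current_gfp_context() now runs before prepare_alloc_pages(), a scoped restriction such as memalloc_nofs_save()/memalloc_nofs_restore() around the caller is applied to the first attempt as well, not only to the slow path as before.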
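Finally, free_contig_range() now takes an unsigned long page count, matching the counts used elsewhere in the contiguous-allocation API. A minimal, hypothetical pairing with alloc_contig_pages() (demo_grab_contig is not part of this patch and assumes CONFIG_CONTIG_ALLOC=y):

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical helper: allocate nr_pages physically contiguous
	 * order-0 pages on node nid, then release them.  With the
	 * signature change above, nr_pages is no longer narrowed to
	 * unsigned int on the free side.
	 */
	static int demo_grab_contig(unsigned long nr_pages, int nid)
	{
		struct page *page;

		page = alloc_contig_pages(nr_pages, GFP_KERNEL, nid, NULL);
		if (!page)
			return -ENOMEM;

		/* ... use the range starting at page_to_pfn(page) ... */

		free_contig_range(page_to_pfn(page), nr_pages);
		return 0;
	}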