Diffstat (limited to 'mm/mm_init.c')
-rw-r--r--   mm/mm_init.c | 904
1 file changed, 417 insertions, 487 deletions
diff --git a/mm/mm_init.c b/mm/mm_init.c index a1963c3322af..fc2a6f1e518f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -24,14 +24,35 @@ #include <linux/page_ext.h> #include <linux/pti.h> #include <linux/pgtable.h> +#include <linux/stackdepot.h> #include <linux/swap.h> #include <linux/cma.h> +#include <linux/crash_dump.h> +#include <linux/execmem.h> +#include <linux/vmstat.h> +#include <linux/kexec_handover.h> +#include <linux/hugetlb.h> #include "internal.h" #include "slab.h" #include "shuffle.h" #include <asm/setup.h> +#ifndef CONFIG_NUMA +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; +EXPORT_SYMBOL(mem_map); +#endif + +/* + * high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. + */ +void *high_memory; +EXPORT_SYMBOL(high_memory); + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -50,7 +71,6 @@ void __init mminit_verify_zonelist(void) struct zonelist *zonelist; int i, listid, zoneid; - BUILD_BUG_ON(MAX_ZONELISTS > 2); for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { /* Identify the zone and nodelist */ @@ -79,9 +99,8 @@ void __init mminit_verify_pageflags_layout(void) int shift, width; unsigned long or_mask, add_mask; - shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + shift = BITS_PER_LONG; + width = shift - NR_NON_PAGEFLAG_BITS; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, @@ -154,7 +173,6 @@ early_param("mminit_loglevel", set_mminit_loglevel); #endif /* CONFIG_DEBUG_MEMORY_INIT */ struct kobject *mm_kobj; -EXPORT_SYMBOL_GPL(mm_kobj); #ifdef CONFIG_SMP s32 vm_committed_as_batch = 32; @@ -226,7 +244,6 @@ static unsigned long required_movablecore_percent __initdata; static unsigned long nr_kernel_pages __initdata; static unsigned long nr_all_pages __initdata; -static unsigned long dma_reserve __initdata; static bool deferred_struct_pages __meminitdata; @@ -362,7 +379,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) nid = memblock_get_region_node(r); - usable_startpfn = PFN_DOWN(r->base); + usable_startpfn = memblock_region_memory_base_pfn(r); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 
min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; @@ -377,6 +394,16 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + if (!memblock_has_mirror()) { + pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n"); + goto out; + } + + if (is_kdump_kernel()) { + pr_warn("The system is under kdump, ignore kernelcore=mirror.\n"); + goto out; + } + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; @@ -428,7 +455,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * was requested by the user */ required_movablecore = - roundup(required_movablecore, MAX_ORDER_NR_PAGES); + round_up(required_movablecore, MAX_ORDER_NR_PAGES); required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; @@ -535,11 +562,11 @@ restart: out2: /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ - for (nid = 0; nid < MAX_NUMNODES; nid++) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; zone_movable_pfn[nid] = - roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); if (zone_movable_pfn[nid] >= end_pfn) @@ -551,13 +578,13 @@ out: node_states[N_MEMORY] = saved_node_state; } -static void __meminit __init_single_page(struct page *page, unsigned long pfn, +void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); - page_mapcount_reset(page); + atomic_set(&page->_mapcount, -1); page_cpupid_reset_last(page); page_kasan_tag_reset(page); @@ -639,6 +666,29 @@ static inline void fixup_hashdist(void) static inline void fixup_hashdist(void) {} #endif /* CONFIG_NUMA */ +/* + * Initialize a reserved page unconditionally, finding its zone first. + */ +void __meminit __init_page_from_nid(unsigned long pfn, int nid) +{ + pg_data_t *pgdat; + int zid; + + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_spans_pfn(zone, pfn)) + break; + } + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); + + if (pageblock_aligned(pfn)) + init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE, + false); +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void pgdat_set_deferred_range(pg_data_t *pgdat) { @@ -665,6 +715,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) if (early_page_ext_enabled()) return false; + + /* Always populate low zones for address-constrained allocations */ + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) + return false; + + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; + /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -674,12 +732,6 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) nr_initialised = 0; } - /* Always populate low zones for address-constrained allocations */ - if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) - return false; - - if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) - return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. 
@@ -693,23 +745,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static void __meminit init_reserved_page(unsigned long pfn, int nid) +static void __meminit __init_deferred_page(unsigned long pfn, int nid) { - pg_data_t *pgdat; - int zid; - if (early_page_initialised(pfn, nid)) return; - pgdat = NODE_DATA(nid); - - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - struct zone *zone = &pgdat->node_zones[zid]; - - if (zone_spans_pfn(zone, pfn)) - break; - } - __init_single_page(pfn_to_page(pfn), pfn, zid, nid); + __init_page_from_nid(pfn, nid); } #else static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} @@ -724,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } -static inline void init_reserved_page(unsigned long pfn, int nid) +static inline void __init_deferred_page(unsigned long pfn, int nid) { } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +void __meminit init_deferred_page(unsigned long pfn, int nid) +{ + __init_deferred_page(pfn, nid); +} + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and @@ -738,25 +784,19 @@ static inline void init_reserved_page(unsigned long pfn, int nid) void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid) { - unsigned long start_pfn = PFN_DOWN(start); - unsigned long end_pfn = PFN_UP(end); - - for (; start_pfn < end_pfn; start_pfn++) { - if (pfn_valid(start_pfn)) { - struct page *page = pfn_to_page(start_pfn); + unsigned long pfn; - init_reserved_page(start_pfn, nid); + for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) { + struct page *page = pfn_to_page(pfn); - /* Avoid false-positive PageTail() */ - INIT_LIST_HEAD(&page->lru); + __init_deferred_page(pfn, nid); - /* - * no need for atomic set_bit because the struct - * page is not visible yet so nobody should - * access it yet. - */ - __SetPageReserved(page); - } + /* + * no need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet. 
+ */ + __SetPageReserved(page); } } @@ -792,6 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * - physical memory bank size is not necessarily the exact multiple of the * arbitrary section size * - early reserved memory may not be listed in memblock.memory + * - non-memory regions covered by the contiguous flatmem mapping * - memory layouts defined with memmap= kernel parameter may not align * nicely with memmap sections * @@ -811,18 +852,14 @@ static void __init init_unavailable_range(unsigned long spfn, unsigned long pfn; u64 pgcnt = 0; - for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(pageblock_start_pfn(pfn))) { - pfn = pageblock_end_pfn(pfn) - 1; - continue; - } + for_each_valid_pfn(pfn, spfn, epfn) { __init_single_page(pfn_to_page(pfn), pfn, zone, node); __SetPageReserved(pfn_to_page(pfn)); pgcnt++; } if (pgcnt) - pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + pr_info("On node %d, zone %s: %lld pages in unavailable ranges\n", node, zone_names[zone], pgcnt); } @@ -838,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn, void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -880,8 +918,14 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMINIT_HOTPLUG) - __SetPageReserved(page); + if (context == MEMINIT_HOTPLUG) { +#ifdef CONFIG_ZONE_DEVICE + if (zone == ZONE_DEVICE) + __SetPageReserved(page); + else +#endif + __SetPageOffline(page); + } /* * Usually, we want to mark the pageblock MIGRATE_MOVABLE, @@ -889,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * over the place during system boot. */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, migratetype); + init_pageblock_migratetype(page, migratetype, + isolate_pageblock); cond_resched(); } pfn++; @@ -912,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone, return; memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, - zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE, + false); if (*hole_pfn < start_pfn) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); @@ -941,19 +987,19 @@ static void __init memmap_init(void) } } -#ifdef CONFIG_SPARSEMEM /* * Initialize the memory map for hole in the range [memory_end, - * section_end]. + * section_end] for SPARSEMEM and in the range [memory_end, memmap_end] + * for FLATMEM. * Append the pages in this hole to the highest zone in the last * node. - * The call to init_unavailable_range() is outside the ifdef to - * silence the compiler warining about zone_id set but not used; - * for FLATMEM it is a nop anyway */ +#ifdef CONFIG_SPARSEMEM end_pfn = round_up(end_pfn, PAGES_PER_SECTION); - if (hole_pfn < end_pfn) +#else + end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES); #endif + if (hole_pfn < end_pfn) init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); } @@ -979,7 +1025,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * and zone_device_data. It is a bug if a ZONE_DEVICE page is * ever freed or placed on a driver-private list. 
*/ - page->pgmap = pgmap; + page_folio(page)->pgmap = pgmap; page->zone_device_data = NULL; /* @@ -993,17 +1039,30 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * because this is done early in section_activate() */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); cond_resched(); } /* - * ZONE_DEVICE pages are released directly to the driver page allocator - * which will set the page count to 1 when allocating the page. + * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released + * directly to the driver page allocator which will set the page count + * to 1 when allocating the page. + * + * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have + * their refcount reset to one whenever they are freed (ie. after + * their refcount drops to 0). */ - if (pgmap->type == MEMORY_DEVICE_PRIVATE || - pgmap->type == MEMORY_DEVICE_COHERENT) + switch (pgmap->type) { + case MEMORY_DEVICE_FS_DAX: + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: + case MEMORY_DEVICE_PCI_P2PDMA: set_page_count(page, 0); + break; + + case MEMORY_DEVICE_GENERIC: + break; + } } /* @@ -1020,7 +1079,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, if (!vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); - return 2 * (PAGE_SIZE / sizeof(struct page)); + return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); } static void __ref memmap_init_compound(struct page *head, @@ -1032,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head, unsigned long pfn, end_pfn = head_pfn + nr_pages; unsigned int order = pgmap->vmemmap_shift; + /* + * We have to initialize the pages, including setting up page links. + * prep_compound_page() does not take care of that, so instead we + * open-code prep_compound_page() so we can take care of initializing + * the pages in the same go. + */ __SetPageHead(head); for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); @@ -1039,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); prep_compound_tail(head, pfn - head_pfn); set_page_count(page, 0); - - /* - * The first tail page stores important compound page info. - * Call prep_compound_head() after the first tail page has - * been initialized, to not have the data overwritten. - */ - if (pfn == head_pfn + 1) - prep_compound_head(head, order); } + prep_compound_head(head, order); } void __ref memmap_init_zone_device(struct zone *zone, @@ -1105,7 +1163,6 @@ void __ref memmap_init_zone_device(struct zone *zone, */ static void __init adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, - unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, unsigned long *zone_end_pfn) @@ -1134,7 +1191,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. 
*/ -unsigned long __init __absent_pages_in_range(int nid, +static unsigned long __init __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -1222,9 +1279,8 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); - adjust_zone_range_for_zone_movable(nid, zone_type, - node_start_pfn, node_end_pfn, - zone_start_pfn, zone_end_pfn); + adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn, + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) @@ -1256,6 +1312,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat) pr_debug("On node %d totalpages: 0\n", pgdat->node_id); } +static void __init calc_nr_kernel_pages(void) +{ + unsigned long start_pfn, end_pfn; + phys_addr_t start_addr, end_addr; + u64 u; +#ifdef CONFIG_HIGHMEM + unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; +#endif + + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { + start_pfn = PFN_UP(start_addr); + end_pfn = PFN_DOWN(end_addr); + + if (start_pfn < end_pfn) { + nr_all_pages += end_pfn - start_pfn; +#ifdef CONFIG_HIGHMEM + start_pfn = clamp(start_pfn, 0, high_zone_low); + end_pfn = clamp(end_pfn, 0, high_zone_low); +#endif + nr_kernel_pages += end_pfn - start_pfn; + } + } +} + static void __init calculate_node_totalpages(struct pglist_data *pgdat, unsigned long node_start_pfn, unsigned long node_end_pfn) @@ -1299,26 +1379,6 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat, pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } -static unsigned long __init calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) -{ - unsigned long pages = spanned_pages; - - /* - * Provide a more accurate estimation if there are holes within - * the zone and SPARSEMEM is in use. If there are holes within the - * zone, each populated memory region may cost us one or two extra - * memmap pages due to alignment because memmap pages for each - * populated regions may not be naturally aligned on page boundary. - * So the (present_pages >> 4) heuristic is a tradeoff for that. - */ - if (spanned_pages > present_pages + (present_pages >> 4) && - IS_ENABLED(CONFIG_SPARSEMEM)) - pages = present_pages; - - return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void pgdat_init_split_queue(struct pglist_data *pgdat) { @@ -1410,7 +1470,7 @@ void __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_SPARSEMEM /* - * Calculate the size of the zone->blockflags rounded to an unsigned long + * Calculate the size of the zone->pageblock_flags rounded to an unsigned long * Start by making sure zonesize is a multiple of pageblock_order by rounding * up. 
Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally * round what is now in bits to nearest long in bits, then return it in @@ -1421,12 +1481,12 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l unsigned long usemapsize; zonesize += zone_start_pfn & (pageblock_nr_pages-1); - usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = round_up(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; - usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + usemapsize = round_up(usemapsize, BITS_PER_LONG); - return usemapsize / 8; + return usemapsize / BITS_PER_BYTE; } static void __ref setup_usemap(struct zone *zone) @@ -1452,7 +1512,7 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order = MAX_ORDER; + unsigned int order = PAGE_BLOCK_MAX_ORDER; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) @@ -1464,8 +1524,7 @@ void __init set_pageblock_order(void) /* * Assume the largest contiguous order of interest is a huge page. - * This value may be variable depending on boot parameters on IA64 and - * powerpc. + * This value may be variable depending on boot parameters on powerpc. */ pageblock_order = order; } @@ -1534,15 +1593,6 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) } #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - * NOTE: this function is only called during early init. - */ static void __init free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; @@ -1553,47 +1603,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, freesize, memmap_pages; - - size = zone->spanned_pages; - freesize = zone->present_pages; - - /* - * Adjust freesize so that it accounts for how much memory - * is used by this zone for memmap. This affects the watermark - * and per-cpu initialisations - */ - memmap_pages = calc_memmap_size(size, freesize); - if (!is_highmem_idx(j)) { - if (freesize >= memmap_pages) { - freesize -= memmap_pages; - if (memmap_pages) - pr_debug(" %s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); - } else - pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n", - zone_names[j], memmap_pages, freesize); - } - - /* Account for reserved pages */ - if (j == 0 && freesize > dma_reserve) { - freesize -= dma_reserve; - pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); - } - - if (!is_highmem_idx(j)) - nr_kernel_pages += freesize; - /* Charge for highmem memmap if there are enough kernel pages */ - else if (nr_kernel_pages > memmap_pages * 2) - nr_kernel_pages -= memmap_pages; - nr_all_pages += freesize; + unsigned long size = zone->spanned_pages; /* - * Set an approximate value for lowmem here, it will be adjusted - * when the bootmem allocator frees pages into the buddy system. - * And all highmem pages will be managed by the buddy system. + * Initialize zone->managed_pages as 0 , it will be reset + * when memblock allocator frees pages into buddy system. 
*/ - zone_init_internals(zone, j, nid, freesize); + zone_init_internals(zone, j, nid, zone->present_pages); if (!size) continue; @@ -1608,13 +1624,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, { void *ptr; + /* + * Kmemleak will explicitly scan mem_map by traversing all valid + * `struct *page`,so memblock does not need to be added to the scan list. + */ if (exact_nid) ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); else ptr = memblock_alloc_try_nid_raw(size, align, min_addr, - MEMBLOCK_ALLOC_ACCESSIBLE, + MEMBLOCK_ALLOC_NOLEAKTRACE, nid); if (ptr && size > 0) @@ -1626,8 +1646,8 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, #ifdef CONFIG_FLATMEM static void __init alloc_node_mem_map(struct pglist_data *pgdat) { - unsigned long __maybe_unused start = 0; - unsigned long __maybe_unused offset = 0; + unsigned long start, offset, size, end; + struct page *map; /* Skip empty nodes */ if (!pgdat->node_spanned_pages) @@ -1635,39 +1655,32 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; - /* ia64 gets its own node_mem_map, before this, without bootmem */ - if (!pgdat->node_mem_map) { - unsigned long size, end; - struct page *map; - - /* - * The zone's endpoints aren't required to be MAX_ORDER - * aligned but the node_mem_map endpoints must be in order - * for the buddy allocator to function correctly. - */ - end = pgdat_end_pfn(pgdat); - end = ALIGN(end, MAX_ORDER_NR_PAGES); - size = (end - start) * sizeof(struct page); - map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, - pgdat->node_id, false); - if (!map) - panic("Failed to allocate %ld bytes for node %d memory map\n", - size, pgdat->node_id); - pgdat->node_mem_map = map + offset; - } - pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", - __func__, pgdat->node_id, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NUMA /* - * With no DISCONTIG, the global mem_map is just set as node 0's + * The zone's endpoints aren't required to be MAX_PAGE_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. */ - if (pgdat == NODE_DATA(0)) { - mem_map = NODE_DATA(0)->node_mem_map; - if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= offset; - } -#endif + end = ALIGN(pgdat_end_pfn(pgdat), MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); + map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, + pgdat->node_id, false); + if (!map) + panic("Failed to allocate %ld bytes for node %d memory map\n", + size, pgdat->node_id); + pgdat->node_mem_map = map + offset; + memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); + + /* the global mem_map is just set as node 0's */ + WARN_ON(pgdat != NODE_DATA(0)); + + mem_map = pgdat->node_mem_map; + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= offset; + + max_mapnr = end - start; } #else static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } @@ -1681,8 +1694,7 @@ static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } * * It returns the start and end page frame of a node based on information * provided by memblock_set_node(). 
If called for a node - * with no available memory, a warning is printed and the start and end - * PFNs will be 0. + * with no available memory, the start and end PFNs will be 0. */ void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) @@ -1737,7 +1749,7 @@ static void __init free_area_init_node(int nid) } /* Any regular or high memory on that node ? */ -static void check_for_memory(pg_data_t *pgdat) +static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; @@ -1775,6 +1787,27 @@ static bool arch_has_descending_max_zone_pfns(void) return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); } +static void __init set_high_memory(void) +{ + phys_addr_t highmem = memblock_end_of_DRAM(); + + /* + * Some architectures (e.g. ARM) set high_memory very early and + * use it in arch setup code. + * If an architecture already set high_memory don't overwrite it + */ + if (high_memory) + return; + +#ifdef CONFIG_HIGHMEM + if (arch_has_descending_max_zone_pfns() || + highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])) + highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]); +#endif + + high_memory = phys_to_virt(highmem - 1) + 1; +} + /** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -1869,43 +1902,36 @@ void __init free_area_init(unsigned long *max_zone_pfn) for_each_node(nid) { pg_data_t *pgdat; - if (!node_online(nid)) { - pr_info("Initializing node %d as memoryless\n", nid); - - /* Allocator not initialized yet */ - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - arch_refresh_nodedata(nid, pgdat); - free_area_init_node(nid); - - /* - * We do not want to confuse userspace by sysfs - * files/directories for node without any memory - * attached to it, so this node is not marked as - * N_MEMORY and not marked online so that no sysfs - * hierarchy will be created via register_one_node for - * it. The pgdat will get fully initialized by - * hotadd_init_pgdat() when memory is hotplugged into - * this node. - */ - continue; - } + if (!node_online(nid)) + alloc_offline_node_data(nid); pgdat = NODE_DATA(nid); free_area_init_node(nid); - /* Any memory on that node */ - if (pgdat->node_present_pages) + /* + * No sysfs hierarchy will be created via register_node() + *for memory-less node because here it's not marked as N_MEMORY + *and won't be set online later. The benefit is userspace + *program won't be confused by sysfs files/directories of + *memory-less node. The pgdat will get fully initialized by + *hotadd_init_pgdat() when memory is hotplugged into this node. 
+ */ + if (pgdat->node_present_pages) { node_set_state(nid, N_MEMORY); - check_for_memory(pgdat); + check_for_memory(pgdat); + } } + for_each_node_state(nid, N_MEMORY) + sparse_vmemmap_init_nid_late(nid); + + calc_nr_kernel_pages(); memmap_init(); /* disable hash distribution for systems with a single node */ fixup_hashdist(); + + set_high_memory(); } /** @@ -1959,8 +1985,8 @@ unsigned long __init node_map_pfn_alignment(void) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(unsigned long pfn, - unsigned long nr_pages) +static void __init deferred_free_pages(unsigned long pfn, + unsigned long nr_pages) { struct page *page; unsigned long i; @@ -1973,18 +1999,20 @@ static void __init deferred_free_range(unsigned long pfn, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) - set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); - __free_pages_core(page, MAX_ORDER); + init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, + false); + __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } - /* Accept chunks smaller than MAX_ORDER upfront */ - accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); + /* Accept chunks smaller than MAX_PAGE_ORDER upfront */ + accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE); for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_core(page, 0); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, + false); + __free_pages_core(page, 0, MEMINIT_EARLY); } } @@ -1999,180 +2027,87 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Returns true if page needs to be initialized or freed to buddy allocator. - * - * We check if a current MAX_ORDER block is valid by only checking the validity - * of the head pfn. - */ -static inline bool __init deferred_pfn_valid(unsigned long pfn) -{ - if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn)) - return false; - return true; -} - -/* - * Free pages to buddy allocator. Try to free aligned pages in - * MAX_ORDER_NR_PAGES sizes. - */ -static void __init deferred_free_pages(unsigned long pfn, - unsigned long end_pfn) -{ - unsigned long nr_free = 0; - - for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(pfn)) { - deferred_free_range(pfn - nr_free, nr_free); - nr_free = 0; - } else if (IS_MAX_ORDER_ALIGNED(pfn)) { - deferred_free_range(pfn - nr_free, nr_free); - nr_free = 1; - } else { - nr_free++; - } - } - /* Free the last block of pages to allocator */ - deferred_free_range(pfn - nr_free, nr_free); -} - -/* * Initialize struct pages. We minimize pfn page lookups and scheduler checks * by performing it only once every MAX_ORDER_NR_PAGES. * Return number of pages initialized. 
*/ -static unsigned long __init deferred_init_pages(struct zone *zone, - unsigned long pfn, - unsigned long end_pfn) +static unsigned long __init deferred_init_pages(struct zone *zone, + unsigned long pfn, unsigned long end_pfn) { int nid = zone_to_nid(zone); - unsigned long nr_pages = 0; + unsigned long nr_pages = end_pfn - pfn; int zid = zone_idx(zone); - struct page *page = NULL; + struct page *page = pfn_to_page(pfn); - for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(pfn)) { - page = NULL; - continue; - } else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) { - page = pfn_to_page(pfn); - } else { - page++; - } + for (; pfn < end_pfn; pfn++, page++) __init_single_page(page, pfn, zid, nid); - nr_pages++; - } - return (nr_pages); -} - -/* - * This function is meant to pre-load the iterator for the zone init. - * Specifically it walks through the ranges until we are caught up to the - * first_init_pfn value and exits there. If we never encounter the value we - * return false indicating there are no valid ranges left. - */ -static bool __init -deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, - unsigned long *spfn, unsigned long *epfn, - unsigned long first_init_pfn) -{ - u64 j; - - /* - * Start out by walking through the ranges in this zone that have - * already been initialized. We don't need to do anything with them - * so we just need to flush them out of the system. - */ - for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { - if (*epfn <= first_init_pfn) - continue; - if (*spfn < first_init_pfn) - *spfn = first_init_pfn; - *i = j; - return true; - } - - return false; + return nr_pages; } /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, then free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages. * - * In order to try and keep some memory in the cache we have the loop - * broken along max page order boundaries. This way we will not cause - * any issues with the buddy page computation. + * At this point reserved pages and struct pages that correspond to holes in + * memblock.memory are already intialized so every free range has a valid + * memory map around it. + * This ensures that access of pages that are ahead of the range being + * initialized (computing buddy page in __free_one_page()) always reads a valid + * struct page. + * + * In order to try and improve CPU cache locality we have the loop broken along + * max page order boundaries. 
*/ static unsigned long __init -deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, - unsigned long *end_pfn) +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, + struct zone *zone) { - unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); - unsigned long spfn = *start_pfn, epfn = *end_pfn; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; - u64 j = *i; + phys_addr_t start, end; + u64 i = 0; - /* First we loop through and initialize the page values */ - for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { - unsigned long t; + for_each_free_mem_range(i, nid, 0, &start, &end, NULL) { + unsigned long spfn = PFN_UP(start); + unsigned long epfn = PFN_DOWN(end); - if (mo_pfn <= *start_pfn) + if (spfn >= end_pfn) break; - t = min(mo_pfn, *end_pfn); - nr_pages += deferred_init_pages(zone, *start_pfn, t); + spfn = max(spfn, start_pfn); + epfn = min(epfn, end_pfn); - if (mo_pfn < *end_pfn) { - *start_pfn = mo_pfn; - break; - } - } - - /* Reset values and now loop through freeing pages as needed */ - swap(j, *i); + while (spfn < epfn) { + unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES); + unsigned long chunk_end = min(mo_pfn, epfn); - for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { - unsigned long t; - - if (mo_pfn <= spfn) - break; + nr_pages += deferred_init_pages(zone, spfn, chunk_end); + deferred_free_pages(spfn, chunk_end - spfn); - t = min(mo_pfn, epfn); - deferred_free_pages(spfn, t); + spfn = chunk_end; - if (mo_pfn <= epfn) - break; + if (irqs_disabled()) + touch_nmi_watchdog(); + else + cond_resched(); + } } return nr_pages; } static void __init -deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, - void *arg) +deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn, + void *arg) { - unsigned long spfn, epfn; struct zone *zone = arg; - u64 i; - deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); - - /* - * Initialize and free pages in MAX_ORDER sized increments so that we - * can avoid introducing any issues with the buddy allocator. - */ - while (spfn < end_pfn) { - deferred_init_maxorder(&i, zone, &spfn, &epfn); - cond_resched(); - } + deferred_init_memmap_chunk(start_pfn, end_pfn, zone); } -/* An arch may override for more concurrency. 
*/ -__weak int __init +static unsigned int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask) { - return 1; + return max(cpumask_weight(node_cpumask), 1U); } /* Initialise remaining memory on a node */ @@ -2180,12 +2115,10 @@ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - unsigned long spfn = 0, epfn = 0; - unsigned long first_init_pfn, flags; + int max_threads = deferred_page_init_max_threads(cpumask); + unsigned long first_init_pfn, last_pfn, flags; unsigned long start = jiffies; struct zone *zone; - int zid, max_threads; - u64 i; /* Bind memory initialisation thread to a local node if possible */ if (!cpumask_empty(cpumask)) @@ -2211,39 +2144,25 @@ static int __init deferred_init_memmap(void *data) */ pgdat_resize_unlock(pgdat, &flags); - /* Only the highest zone is deferred so find it */ - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - zone = pgdat->node_zones + zid; - if (first_init_pfn < zone_end_pfn(zone)) - break; - } + /* Only the highest zone is deferred */ + zone = pgdat->node_zones + pgdat->nr_zones - 1; + last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone)); - /* If the zone is empty somebody else may have cleared out the zone */ - if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, - first_init_pfn)) - goto zone_empty; + struct padata_mt_job job = { + .thread_fn = deferred_init_memmap_job, + .fn_arg = zone, + .start = first_init_pfn, + .size = last_pfn - first_init_pfn, + .align = PAGES_PER_SECTION, + .min_chunk = PAGES_PER_SECTION, + .max_threads = max_threads, + .numa_aware = false, + }; - max_threads = deferred_page_init_max_threads(cpumask); + padata_do_multithreaded(&job); - while (spfn < epfn) { - unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); - struct padata_mt_job job = { - .thread_fn = deferred_init_memmap_chunk, - .fn_arg = zone, - .start = spfn, - .size = epfn_align - spfn, - .align = PAGES_PER_SECTION, - .min_chunk = PAGES_PER_SECTION, - .max_threads = max_threads, - }; - - padata_do_multithreaded(&job); - deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, - epfn_align); - } -zone_empty: /* Sanity check that the next zone really is unpopulated */ - WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone)); pr_info("node %d deferred pages initialised in %ums\n", pgdat->node_id, jiffies_to_msecs(jiffies - start)); @@ -2262,19 +2181,14 @@ zone_empty: * Return true when zone was grown, otherwise return false. We return true even * when we grow less than requested, to let the caller decide if there are * enough pages to satisfy the allocation. - * - * Note: We use noinline because this function is needed only during boot, and - * it is called from a __ref function _deferred_grow_zone. This way we are - * making sure that it is not inlined into permanent text section. 
*/ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order); pg_data_t *pgdat = zone->zone_pgdat; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; unsigned long spfn, epfn, flags; unsigned long nr_pages = 0; - u64 i; /* Only the last zone may have deferred pages */ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) @@ -2291,37 +2205,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - /* If the zone is empty somebody else may have cleared out the zone */ - if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, - first_deferred_pfn)) { - pgdat->first_deferred_pfn = ULONG_MAX; - pgdat_resize_unlock(pgdat, &flags); - /* Retry only once. */ - return first_deferred_pfn != ULONG_MAX; + /* + * Initialize at least nr_pages_needed in section chunks. + * If a section has less free memory than nr_pages_needed, the next + * section will be also initialized. + * Note, that it still does not guarantee that allocation of order can + * be satisfied if the sections are fragmented because of memblock + * allocations. + */ + for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1); + nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone); + spfn = epfn, epfn += PAGES_PER_SECTION) { + nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone); } /* - * Initialize and free pages in MAX_ORDER sized increments so - * that we can avoid introducing any issues with the buddy - * allocator. + * There were no pages to initialize and free which means the zone's + * memory map is completely initialized. */ - while (spfn < epfn) { - /* update our first deferred PFN for this section */ - first_deferred_pfn = spfn; - - nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); - touch_nmi_watchdog(); - - /* We should only stop along section boundaries */ - if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) - continue; + pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX; - /* If our quota has been met we can stop here */ - if (nr_pages >= nr_pages_needed) - break; - } - - pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; @@ -2340,13 +2243,24 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); set_page_refcounted(page); + /* pages were reserved and not allocated */ + clear_page_tag_ref(page); __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; } +/* + * Similar to above, but only set the migrate type and stats. + */ +void __init init_cma_pageblock(struct page *page) +{ + init_pageblock_migratetype(page, MIGRATE_CMA, false); + adjust_managed_page_count(page, pageblock_nr_pages); + page_zone(page)->cma_pages += pageblock_nr_pages; +} #endif void set_zone_contiguous(struct zone *zone) @@ -2371,6 +2285,32 @@ void set_zone_contiguous(struct zone *zone) zone->contiguous = true; } +/* + * Check if a PFN range intersects multiple zones on one or more + * NUMA nodes. Specify the @nid argument if it is known that this + * PFN range is on one node, NUMA_NO_NODE otherwise. 
+ */ +bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct zone *zone, *izone = NULL; + + for_each_zone(zone) { + if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid) + continue; + + if (zone_intersects(zone, start_pfn, nr_pages)) { + if (izone != NULL) + return true; + izone = zone; + } + + } + + return false; +} + +static void __init mem_init_print_info(void); void __init page_alloc_init_late(void) { struct zone *zone; @@ -2397,6 +2337,8 @@ void __init page_alloc_init_late(void) files_maxfiles_init(); #endif + /* Accounting of total+free memory is stable at this point. */ + mem_init_print_info(); buffer_init(); /* Discard memblock private memory */ @@ -2415,17 +2357,6 @@ void __init page_alloc_init_late(void) page_alloc_sysctl_init(); } -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES -/* - * Returns the number of pages that arch has reserved but - * is not known to alloc_large_system_hash(). - */ -static unsigned long __init arch_reserved_kernel_pages(void) -{ - return 0; -} -#endif - /* * Adaptive scale is meant to reduce sizes of hash tables on large memory * machines. As memory size is increased the scale is also increased but at @@ -2468,7 +2399,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ if (PAGE_SIZE < SZ_1M) @@ -2490,15 +2420,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); - /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely(flags & HASH_SMALL)) { - /* Makes no sense without HASH_EARLY */ - WARN_ON(!(flags & HASH_EARLY)); - if (!(numentries >> *_hash_shift)) { - numentries = 1UL << *_hash_shift; - BUG_ON(!numentries); - } - } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -2527,7 +2449,7 @@ void *__init alloc_large_system_hash(const char *tablename, else table = memblock_alloc_raw(size, SMP_CACHE_BYTES); - } else if (get_order(size) > MAX_ORDER || hashdist) { + } else if (get_order(size) > MAX_PAGE_ORDER || hashdist) { table = vmalloc_huge(size, gfp_flags); virt = true; if (table) @@ -2547,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename, panic("Failed to allocate %s hash table\n", tablename); pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, + tablename, 1UL << log2qty, get_order(size), size, virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) @@ -2558,26 +2480,9 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/** - * set_dma_reserve - set the specified number of pages reserved in the first zone - * @new_dma_reserve: The number of pages to mark reserved - * - * The per-cpu batchsize and zone watermarks are determined by managed_pages. - * In the DMA zone, a significant percentage may be consumed by kernel image - * and other unfreeable allocations which can skew the watermarks badly. This - * function may optionally be used to account for unfreeable pages in the - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and - * smaller per-cpu batchsize. 
- */ -void __init set_dma_reserve(unsigned long new_dma_reserve) -{ - dma_reserve = new_dma_reserve; -} - void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) { - if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) { int nid = early_pfn_to_nid(pfn); @@ -2589,7 +2494,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, /* KMSAN will take care of these pages. */ return; } - __free_pages_core(page, order); + + /* pages were reserved and not allocated */ + clear_page_tag_ref(page); + __free_pages_core(page, order, MEMINIT_EARLY); } DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); @@ -2617,6 +2525,14 @@ early_param("init_on_free", early_init_on_free); DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); +static bool check_pages_enabled_early __initdata; + +static int __init early_check_pages(char *buf) +{ + return kstrtobool(buf, &check_pages_enabled_early); +} +early_param("check_pages", early_check_pages); + /* * Enable static keys related to various memory debugging and hardening options. * Some override others, and depend on early params that are evaluated in the @@ -2626,7 +2542,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); static void __init mem_debugging_and_hardening_init(void) { bool page_poisoning_requested = false; - bool want_check_pages = false; + bool want_check_pages = check_pages_enabled_early; #ifdef CONFIG_PAGE_POISONING /* @@ -2696,18 +2612,12 @@ static void __init report_meminit(void) stack = "all(pattern)"; else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) stack = "all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) - stack = "byref_all(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) - stack = "byref(zero)"; - else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) - stack = "__user(zero)"; else stack = "off"; pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", - stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off"); + stack, str_on_off(want_init_on_alloc(GFP_KERNEL)), + str_on_off(want_init_on_free())); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } @@ -2763,27 +2673,46 @@ static void __init mem_init_print_info(void) ); } +void __init __weak arch_mm_preinit(void) +{ +} + +void __init __weak mem_init(void) +{ +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { + arch_mm_preinit(); + hugetlb_bootmem_alloc(); + /* Initializations relying on SMP setup */ + BUILD_BUG_ON(MAX_ZONELISTS > 2); build_all_zonelists(NULL); page_alloc_init_cpuhp(); - + alloc_tag_sec_init(); /* * page_ext requires contiguous pages, - * bigger than MAX_ORDER unless SPARSEMEM. + * bigger than MAX_PAGE_ORDER unless SPARSEMEM. */ page_ext_init_flatmem(); mem_debugging_and_hardening_init(); - kfence_alloc_pool(); + kfence_alloc_pool_and_metadata(); report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + + /* + * KHO memory setup must happen while memblock is still active, but + * as close as possible to buddy initialization + */ + kho_memory_init(); + + memblock_free_all(); mem_init(); - mem_init_print_info(); kmem_cache_init(); /* * page_owner must be initialized after buddy is ready, and also after @@ -2804,4 +2733,5 @@ void __init mm_core_init(void) pti_init(); kmsan_init_runtime(); mm_cache_init(); + execmem_init(); } |
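
The two stand-alone sketches below are editorial illustrations, not part of the patch. The first shows why the diff can replace roundup() with round_up() for required_movablecore, zone_movable_pfn[] and usemap_size(): the two macros agree whenever the alignment value is a power of two (which MAX_ORDER_NR_PAGES, pageblock_nr_pages and BITS_PER_LONG all are), and round_up() needs only a mask instead of a division. The macro bodies are simplified copies of the kernel definitions, and the boundary value 512 is an arbitrary power of two picked for the demo.

/*
 * Stand-alone demo (userspace, not kernel code): roundup() vs round_up().
 * roundup() works for any alignment; round_up() assumes a power of two.
 */
#include <assert.h>
#include <stdio.h>

/* Simplified copy of include/linux/math.h roundup(). */
#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))
/* Simplified copy of include/linux/align.h round_up() (power-of-two only). */
#define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long boundary = 512;	/* any power of two works the same way */

	for (unsigned long x = 1; x <= 4096; x++)
		assert(roundup(x, boundary) == round_up(x, boundary));

	printf("roundup() and round_up() agree for power-of-two boundaries\n");
	return 0;
}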

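The second sketch replays the usemap_size() arithmetic touched above, where the patch switches to round_up(), BITS_PER_LONG and BITS_PER_BYTE in place of the open-coded 8 * sizeof(unsigned long) and / 8. pageblock_order, NR_PAGEBLOCK_BITS and BITS_PER_LONG are arch/config dependent, so the numbers passed in main() are only an example (4K pages, 2 MiB pageblocks, 64-bit longs, 4 pageblock bits) and are not taken from the patch itself.

/*
 * Stand-alone rendition of usemap_size(): bytes of zone->pageblock_flags
 * needed to store nr_pageblock_bits per pageblock, rounded to whole longs.
 */
#include <stdio.h>

#define BITS_PER_BYTE	8
#define BITS_PER_LONG	64
#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)

static unsigned long usemap_size(unsigned long zone_start_pfn,
				 unsigned long zonesize,
				 unsigned int pageblock_order,
				 unsigned int nr_pageblock_bits)
{
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long usemapsize;

	/* Account for a zone start that is not pageblock aligned. */
	zonesize += zone_start_pfn & (pageblock_nr_pages - 1);
	usemapsize = round_up(zonesize, pageblock_nr_pages);
	usemapsize >>= pageblock_order;		/* number of pageblocks */
	usemapsize *= nr_pageblock_bits;	/* bits of pageblock flags */
	usemapsize = round_up(usemapsize, BITS_PER_LONG);

	return usemapsize / BITS_PER_BYTE;	/* bytes */
}

int main(void)
{
	/* 4 GiB zone at pfn 0, 512-page pageblocks, 4 bits per pageblock. */
	printf("%lu bytes\n", usemap_size(0, 1UL << 20, 9, 4));
	return 0;
}

With these example values the zone has 2048 pageblocks, so the bitmap is 2048 * 4 = 8192 bits, i.e. 1 KiB of pageblock_flags.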