Diffstat (limited to 'mm/mm_init.c')
 mm/mm_init.c | 654 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 329 insertions(+), 325 deletions(-)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3ec04933f7fd..fc2a6f1e518f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -29,12 +29,30 @@
#include <linux/cma.h>
#include <linux/crash_dump.h>
#include <linux/execmem.h>
+#include <linux/vmstat.h>
+#include <linux/kexec_handover.h>
+#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
#include <asm/setup.h>
+#ifndef CONFIG_NUMA
+unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
+
+struct page *mem_map;
+EXPORT_SYMBOL(mem_map);
+#endif
+
+/*
+ * high_memory defines the upper bound on direct map memory, the end
+ * of ZONE_NORMAL.
+ */
+void *high_memory;
+EXPORT_SYMBOL(high_memory);
+
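For context, the FLATMEM memory map hangs off exactly these two globals: the generic helpers in include/asm-generic/memory_model.h reduce pfn/page conversion to pointer arithmetic against mem_map (roughly as below), and max_mapnr bounds the array, so the generic FLATMEM pfn_valid() is essentially a "(pfn - ARCH_PFN_OFFSET) < max_mapnr" check.

	/* FLATMEM variant, lightly paraphrased */
	#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
	#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)
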
#ifdef CONFIG_DEBUG_MEMORY_INIT
int __meminitdata mminit_loglevel;
@@ -53,7 +71,6 @@ void __init mminit_verify_zonelist(void)
struct zonelist *zonelist;
int i, listid, zoneid;
- BUILD_BUG_ON(MAX_ZONELISTS > 2);
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
/* Identify the zone and nodelist */
@@ -83,8 +100,7 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = BITS_PER_LONG;
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
+ width = shift - NR_NON_PAGEFLAG_BITS;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
@@ -363,7 +379,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nid = memblock_get_region_node(r);
- usable_startpfn = PFN_DOWN(r->base);
+ usable_startpfn = memblock_region_memory_base_pfn(r);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
@@ -439,7 +455,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* was requested by the user
*/
required_movablecore =
- roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+ round_up(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
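
The roundup() -> round_up() switches in this patch are behavior-preserving because MAX_ORDER_NR_PAGES (and, further down, pageblock_nr_pages and BITS_PER_LONG) are powers of two, for which round_up() can use a mask instead of a division. A minimal userspace sketch, using simplified stand-ins rather than the kernel's exact macro definitions:

	#include <assert.h>
	#include <stdio.h>

	/* simplified stand-ins for the kernel macros in include/linux/math.h */
	#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)	/* y must be a power of two */
	#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))	/* works for any y */

	int main(void)
	{
		unsigned long max_order_nr_pages = 1UL << 10;	/* assuming MAX_PAGE_ORDER == 10 */
		unsigned long required_movablecore = 123456;

		/* for power-of-two alignment both forms agree; round_up() is just cheaper */
		assert(round_up(required_movablecore, max_order_nr_pages) ==
		       roundup(required_movablecore, max_order_nr_pages));
		printf("%lu\n", round_up(required_movablecore, max_order_nr_pages));
		return 0;
	}
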
@@ -546,11 +562,11 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
zone_movable_pfn[nid] =
- roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
if (zone_movable_pfn[nid] >= end_pfn)
@@ -568,7 +584,7 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
- page_mapcount_reset(page);
+ atomic_set(&page->_mapcount, -1);
page_cpupid_reset_last(page);
page_kasan_tag_reset(page);
@@ -650,6 +666,29 @@ static inline void fixup_hashdist(void)
static inline void fixup_hashdist(void) {}
#endif /* CONFIG_NUMA */
+/*
+ * Initialize a reserved page unconditionally, finding its zone first.
+ */
+void __meminit __init_page_from_nid(unsigned long pfn, int nid)
+{
+ pg_data_t *pgdat;
+ int zid;
+
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_spans_pfn(zone, pfn))
+ break;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+
+ if (pageblock_aligned(pfn))
+ init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
+ false);
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
@@ -676,6 +715,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
if (early_page_ext_enabled())
return false;
+
+ /* Always populate low zones for address-constrained allocations */
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+ return false;
+
+ if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
+ return true;
+
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
@@ -685,12 +732,6 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
nr_initialised = 0;
}
- /* Always populate low zones for address-constrained allocations */
- if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
- return false;
-
- if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
- return true;
/*
* We start only with one section of pages, more pages are added as
* needed until the rest of deferred pages are initialized.
@@ -704,23 +745,12 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn, int nid)
+static void __meminit __init_deferred_page(unsigned long pfn, int nid)
{
- pg_data_t *pgdat;
- int zid;
-
if (early_page_initialised(pfn, nid))
return;
- pgdat = NODE_DATA(nid);
-
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- struct zone *zone = &pgdat->node_zones[zid];
-
- if (zone_spans_pfn(zone, pfn))
- break;
- }
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
+ __init_page_from_nid(pfn, nid);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -735,11 +765,16 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn, int nid)
+static inline void __init_deferred_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+void __meminit init_deferred_page(unsigned long pfn, int nid)
+{
+ __init_deferred_page(pfn, nid);
+}
+
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
@@ -749,25 +784,19 @@ static inline void init_reserved_page(unsigned long pfn, int nid)
void __meminit reserve_bootmem_region(phys_addr_t start,
phys_addr_t end, int nid)
{
- unsigned long start_pfn = PFN_DOWN(start);
- unsigned long end_pfn = PFN_UP(end);
-
- for (; start_pfn < end_pfn; start_pfn++) {
- if (pfn_valid(start_pfn)) {
- struct page *page = pfn_to_page(start_pfn);
+ unsigned long pfn;
- init_reserved_page(start_pfn, nid);
+ for_each_valid_pfn(pfn, PFN_DOWN(start), PFN_UP(end)) {
+ struct page *page = pfn_to_page(pfn);
- /* Avoid false-positive PageTail() */
- INIT_LIST_HEAD(&page->lru);
+ __init_deferred_page(pfn, nid);
- /*
- * no need for atomic set_bit because the struct
- * page is not visible yet so nobody should
- * access it yet.
- */
- __SetPageReserved(page);
- }
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
@@ -803,7 +832,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
- * - non-memory regions covered by the contigious flatmem mapping
+ * - non-memory regions covered by the contiguous flatmem mapping
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
@@ -823,11 +852,7 @@ static void __init init_unavailable_range(unsigned long spfn,
unsigned long pfn;
u64 pgcnt = 0;
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(pageblock_start_pfn(pfn))) {
- pfn = pageblock_end_pfn(pfn) - 1;
- continue;
- }
+ for_each_valid_pfn(pfn, spfn, epfn) {
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
@@ -850,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn,
void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
- struct vmem_altmap *altmap, int migratetype)
+ struct vmem_altmap *altmap, int migratetype,
+ bool isolate_pageblock)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -892,8 +918,14 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMINIT_HOTPLUG)
- __SetPageReserved(page);
+ if (context == MEMINIT_HOTPLUG) {
+#ifdef CONFIG_ZONE_DEVICE
+ if (zone == ZONE_DEVICE)
+ __SetPageReserved(page);
+ else
+#endif
+ __SetPageOffline(page);
+ }
/*
* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
@@ -901,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* over the place during system boot.
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, migratetype);
+ init_pageblock_migratetype(page, migratetype,
+ isolate_pageblock);
cond_resched();
}
pfn++;
@@ -924,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone,
return;
memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
- zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+ zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE,
+ false);
if (*hole_pfn < start_pfn)
init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
@@ -953,19 +987,19 @@ static void __init memmap_init(void)
}
}
-#ifdef CONFIG_SPARSEMEM
/*
* Initialize the memory map for hole in the range [memory_end,
- * section_end].
+ * section_end] for SPARSEMEM and in the range [memory_end, memmap_end]
+ * for FLATMEM.
* Append the pages in this hole to the highest zone in the last
* node.
- * The call to init_unavailable_range() is outside the ifdef to
- * silence the compiler warining about zone_id set but not used;
- * for FLATMEM it is a nop anyway
*/
+#ifdef CONFIG_SPARSEMEM
end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
- if (hole_pfn < end_pfn)
+#else
+ end_pfn = round_up(end_pfn, MAX_ORDER_NR_PAGES);
#endif
+ if (hole_pfn < end_pfn)
init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
}
@@ -991,7 +1025,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
- page->pgmap = pgmap;
+ page_folio(page)->pgmap = pgmap;
page->zone_device_data = NULL;
/*
@@ -1005,17 +1039,30 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* because this is done early in section_activate()
*/
if (pageblock_aligned(pfn)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
cond_resched();
}
/*
- * ZONE_DEVICE pages are released directly to the driver page allocator
- * which will set the page count to 1 when allocating the page.
+ * ZONE_DEVICE pages other than MEMORY_DEVICE_GENERIC are released
+ * directly to the driver page allocator which will set the page count
+ * to 1 when allocating the page.
+ *
+ * MEMORY_DEVICE_GENERIC and MEMORY_DEVICE_FS_DAX pages automatically have
+ * their refcount reset to one whenever they are freed (ie. after
+ * their refcount drops to 0).
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_COHERENT)
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_FS_DAX:
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ case MEMORY_DEVICE_PCI_P2PDMA:
set_page_count(page, 0);
+ break;
+
+ case MEMORY_DEVICE_GENERIC:
+ break;
+ }
}
/*
@@ -1044,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head,
unsigned long pfn, end_pfn = head_pfn + nr_pages;
unsigned int order = pgmap->vmemmap_shift;
+ /*
+ * We have to initialize the pages, including setting up page links.
+ * prep_compound_page() does not take care of that, so instead we
+ * open-code prep_compound_page() so we can take care of initializing
+ * the pages in the same go.
+ */
__SetPageHead(head);
for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -1051,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head,
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
prep_compound_tail(head, pfn - head_pfn);
set_page_count(page, 0);
-
- /*
- * The first tail page stores important compound page info.
- * Call prep_compound_head() after the first tail page has
- * been initialized, to not have the data overwritten.
- */
- if (pfn == head_pfn + 1)
- prep_compound_head(head, order);
}
+ prep_compound_head(head, order);
}
void __ref memmap_init_zone_device(struct zone *zone,
@@ -1424,7 +1470,7 @@ void __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_SPARSEMEM
/*
- * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Calculate the size of the zone->pageblock_flags rounded to an unsigned long
* Start by making sure zonesize is a multiple of pageblock_order by rounding
* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
* round what is now in bits to nearest long in bits, then return it in
@@ -1435,10 +1481,10 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
unsigned long usemapsize;
zonesize += zone_start_pfn & (pageblock_nr_pages-1);
- usemapsize = roundup(zonesize, pageblock_nr_pages);
+ usemapsize = round_up(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
- usemapsize = roundup(usemapsize, BITS_PER_LONG);
+ usemapsize = round_up(usemapsize, BITS_PER_LONG);
return usemapsize / BITS_PER_BYTE;
}
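
To put numbers on the calculation above, a self-contained sketch with assumed (but typical) values of pageblock_order == 9 and NR_PAGEBLOCK_BITS == 4: a 16 GiB zone of 4 KiB pages is 4194304 pfns, i.e. 8192 pageblocks, giving a 4096-byte usemap.

	#include <stdio.h>

	/* assumed values; the real ones depend on the architecture and config */
	#define BITS_PER_LONG		64
	#define pageblock_order		9		/* 512 pages per pageblock */
	#define pageblock_nr_pages	(1UL << pageblock_order)
	#define NR_PAGEBLOCK_BITS	4
	#define round_up(x, y)		((((x) - 1) | ((y) - 1)) + 1)

	static unsigned long usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
	{
		unsigned long usemapsize;

		zonesize += zone_start_pfn & (pageblock_nr_pages - 1);
		usemapsize = round_up(zonesize, pageblock_nr_pages);
		usemapsize >>= pageblock_order;
		usemapsize *= NR_PAGEBLOCK_BITS;
		usemapsize = round_up(usemapsize, BITS_PER_LONG);

		return usemapsize / 8;			/* bits to bytes */
	}

	int main(void)
	{
		printf("%lu\n", usemap_size(0, 1UL << 22));	/* prints 4096 */
		return 0;
	}
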
@@ -1466,7 +1512,7 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order = MAX_PAGE_ORDER;
+ unsigned int order = PAGE_BLOCK_MAX_ORDER;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
@@ -1578,13 +1624,17 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
{
void *ptr;
+ /*
+ * Kmemleak will explicitly scan mem_map by traversing all valid
+ * `struct page`, so memblock does not need to be added to the scan list.
+ */
if (exact_nid)
ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
- MEMBLOCK_ALLOC_ACCESSIBLE,
+ MEMBLOCK_ALLOC_NOLEAKTRACE,
nid);
if (ptr && size > 0)
@@ -1606,7 +1656,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/*
- * The zone's endpoints aren't required to be MAX_PAGE_ORDER
+ * The zone's endpoints aren't required to be MAX_PAGE_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
@@ -1618,17 +1668,19 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
panic("Failed to allocate %ld bytes for node %d memory map\n",
size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
+ memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
__func__, pgdat->node_id, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
-#ifndef CONFIG_NUMA
+
/* the global mem_map is just set as node 0's */
- if (pgdat == NODE_DATA(0)) {
- mem_map = NODE_DATA(0)->node_mem_map;
- if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
- }
-#endif
+ WARN_ON(pgdat != NODE_DATA(0));
+
+ mem_map = pgdat->node_mem_map;
+ if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+ mem_map -= offset;
+
+ max_mapnr = end - start;
}
#else
static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
@@ -1735,6 +1787,27 @@ static bool arch_has_descending_max_zone_pfns(void)
return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
}
+static void __init set_high_memory(void)
+{
+ phys_addr_t highmem = memblock_end_of_DRAM();
+
+ /*
+ * Some architectures (e.g. ARM) set high_memory very early and
+ * use it in arch setup code.
+ * If an architecture already set high_memory don't overwrite it
+ */
+ if (high_memory)
+ return;
+
+#ifdef CONFIG_HIGHMEM
+ if (arch_has_descending_max_zone_pfns() ||
+ highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]))
+ highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]);
+#endif
+
+ high_memory = phys_to_virt(highmem - 1) + 1;
+}
+
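The "phys_to_virt(highmem - 1) + 1" form converts the last mapped byte rather than the one-past-the-end physical address, which may fall outside the direct map. A tiny sketch with made-up numbers (the PAGE_OFFSET value is only an x86-64 style illustration, not something this patch defines):

	#include <stdint.h>
	#include <stdio.h>

	/* hypothetical direct-map base, for illustration only */
	#define PAGE_OFFSET	0xffff888000000000ULL
	#define phys_to_virt(p)	((char *)(uintptr_t)((p) + PAGE_OFFSET))

	int main(void)
	{
		uint64_t highmem = 0x100000000ULL;	/* end of DRAM: one past the last byte */
		void *high_memory = phys_to_virt(highmem - 1) + 1;

		/* one past the last directly mapped byte (0xffff888100000000 here) */
		printf("%p\n", high_memory);
		return 0;
	}
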
/**
* free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
@@ -1829,20 +1902,14 @@ void __init free_area_init(unsigned long *max_zone_pfn)
for_each_node(nid) {
pg_data_t *pgdat;
- if (!node_online(nid)) {
- /* Allocator not initialized yet */
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
- arch_refresh_nodedata(nid, pgdat);
- }
+ if (!node_online(nid))
+ alloc_offline_node_data(nid);
pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/*
- * No sysfs hierarcy will be created via register_one_node()
+ * No sysfs hierarchy will be created via register_node()
*for memory-less node because here it's not marked as N_MEMORY
*and won't be set online later. The benefit is userspace
*program won't be confused by sysfs files/directories of
@@ -1855,11 +1922,16 @@ void __init free_area_init(unsigned long *max_zone_pfn)
}
}
+ for_each_node_state(nid, N_MEMORY)
+ sparse_vmemmap_init_nid_late(nid);
+
calc_nr_kernel_pages();
memmap_init();
/* disable hash distribution for systems with a single node */
fixup_hashdist();
+
+ set_high_memory();
}
/**
@@ -1913,8 +1985,8 @@ unsigned long __init node_map_pfn_alignment(void)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(unsigned long pfn,
- unsigned long nr_pages)
+static void __init deferred_free_pages(unsigned long pfn,
+ unsigned long nr_pages)
{
struct page *page;
unsigned long i;
@@ -1927,18 +1999,20 @@ static void __init deferred_free_range(unsigned long pfn,
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
for (i = 0; i < nr_pages; i += pageblock_nr_pages)
- set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
- __free_pages_core(page, MAX_PAGE_ORDER);
+ init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
+ false);
+ __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
return;
}
/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
- accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+ accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_core(page, 0);
+ init_pageblock_migratetype(page, MIGRATE_MOVABLE,
+ false);
+ __free_pages_core(page, 0, MEMINIT_EARLY);
}
}
@@ -1953,180 +2027,87 @@ static inline void __init pgdat_init_report_one_done(void)
}
/*
- * Returns true if page needs to be initialized or freed to buddy allocator.
- *
- * We check if a current MAX_PAGE_ORDER block is valid by only checking the
- * validity of the head pfn.
- */
-static inline bool __init deferred_pfn_valid(unsigned long pfn)
-{
- if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
- return false;
- return true;
-}
-
-/*
- * Free pages to buddy allocator. Try to free aligned pages in
- * MAX_ORDER_NR_PAGES sizes.
- */
-static void __init deferred_free_pages(unsigned long pfn,
- unsigned long end_pfn)
-{
- unsigned long nr_free = 0;
-
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 0;
- } else if (IS_MAX_ORDER_ALIGNED(pfn)) {
- deferred_free_range(pfn - nr_free, nr_free);
- nr_free = 1;
- } else {
- nr_free++;
- }
- }
- /* Free the last block of pages to allocator */
- deferred_free_range(pfn - nr_free, nr_free);
-}
-
-/*
* Initialize struct pages. We minimize pfn page lookups and scheduler checks
* by performing it only once every MAX_ORDER_NR_PAGES.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(struct zone *zone,
- unsigned long pfn,
- unsigned long end_pfn)
+static unsigned long __init deferred_init_pages(struct zone *zone,
+ unsigned long pfn, unsigned long end_pfn)
{
int nid = zone_to_nid(zone);
- unsigned long nr_pages = 0;
+ unsigned long nr_pages = end_pfn - pfn;
int zid = zone_idx(zone);
- struct page *page = NULL;
+ struct page *page = pfn_to_page(pfn);
- for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(pfn)) {
- page = NULL;
- continue;
- } else if (!page || IS_MAX_ORDER_ALIGNED(pfn)) {
- page = pfn_to_page(pfn);
- } else {
- page++;
- }
+ for (; pfn < end_pfn; pfn++, page++)
__init_single_page(page, pfn, zid, nid);
- nr_pages++;
- }
return nr_pages;
}
/*
- * This function is meant to pre-load the iterator for the zone init.
- * Specifically it walks through the ranges until we are caught up to the
- * first_init_pfn value and exits there. If we never encounter the value we
- * return false indicating there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
- unsigned long *spfn, unsigned long *epfn,
- unsigned long first_init_pfn)
-{
- u64 j;
-
- /*
- * Start out by walking through the ranges in this zone that have
- * already been initialized. We don't need to do anything with them
- * so we just need to flush them out of the system.
- */
- for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
- if (*epfn <= first_init_pfn)
- continue;
- if (*spfn < first_init_pfn)
- *spfn = first_init_pfn;
- *i = j;
- return true;
- }
-
- return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages.
*
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * At this point reserved pages and struct pages that correspond to holes in
+ * memblock.memory are already initialized so every free range has a valid
+ * memory map around it.
+ * This ensures that access of pages that are ahead of the range being
+ * initialized (computing buddy page in __free_one_page()) always reads a valid
+ * struct page.
+ *
+ * In order to try and improve CPU cache locality we have the loop broken along
+ * max page order boundaries.
*/
static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
- unsigned long *end_pfn)
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ struct zone *zone)
{
- unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
- unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
- u64 j = *i;
+ phys_addr_t start, end;
+ u64 i = 0;
- /* First we loop through and initialize the page values */
- for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
- unsigned long t;
+ for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
+ unsigned long spfn = PFN_UP(start);
+ unsigned long epfn = PFN_DOWN(end);
- if (mo_pfn <= *start_pfn)
+ if (spfn >= end_pfn)
break;
- t = min(mo_pfn, *end_pfn);
- nr_pages += deferred_init_pages(zone, *start_pfn, t);
+ spfn = max(spfn, start_pfn);
+ epfn = min(epfn, end_pfn);
- if (mo_pfn < *end_pfn) {
- *start_pfn = mo_pfn;
- break;
- }
- }
+ while (spfn < epfn) {
+ unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long chunk_end = min(mo_pfn, epfn);
- /* Reset values and now loop through freeing pages as needed */
- swap(j, *i);
+ nr_pages += deferred_init_pages(zone, spfn, chunk_end);
+ deferred_free_pages(spfn, chunk_end - spfn);
- for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
- unsigned long t;
+ spfn = chunk_end;
- if (mo_pfn <= spfn)
- break;
-
- t = min(mo_pfn, epfn);
- deferred_free_pages(spfn, t);
-
- if (mo_pfn <= epfn)
- break;
+ if (irqs_disabled())
+ touch_nmi_watchdog();
+ else
+ cond_resched();
+ }
}
return nr_pages;
}
static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- void *arg)
+deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
{
- unsigned long spfn, epfn;
struct zone *zone = arg;
- u64 i;
-
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
- /*
- * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
- * we can avoid introducing any issues with the buddy allocator.
- */
- while (spfn < end_pfn) {
- deferred_init_maxorder(&i, zone, &spfn, &epfn);
- cond_resched();
- }
+ deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
}
-/* An arch may override for more concurrency. */
-__weak int __init
+static unsigned int __init
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
- return 1;
+ return max(cpumask_weight(node_cpumask), 1U);
}
/* Initialise remaining memory on a node */
@@ -2134,12 +2115,10 @@ static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0;
- unsigned long first_init_pfn, flags;
+ int max_threads = deferred_page_init_max_threads(cpumask);
+ unsigned long first_init_pfn, last_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
- int zid, max_threads;
- u64 i;
/* Bind memory initialisation thread to a local node if possible */
if (!cpumask_empty(cpumask))
@@ -2165,40 +2144,25 @@ static int __init deferred_init_memmap(void *data)
*/
pgdat_resize_unlock(pgdat, &flags);
- /* Only the highest zone is deferred so find it */
- for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- zone = pgdat->node_zones + zid;
- if (first_init_pfn < zone_end_pfn(zone))
- break;
- }
-
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_init_pfn))
- goto zone_empty;
+ /* Only the highest zone is deferred */
+ zone = pgdat->node_zones + pgdat->nr_zones - 1;
+ last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone));
- max_threads = deferred_page_init_max_threads(cpumask);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_job,
+ .fn_arg = zone,
+ .start = first_init_pfn,
+ .size = last_pfn - first_init_pfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ .numa_aware = false,
+ };
- while (spfn < epfn) {
- unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
- struct padata_mt_job job = {
- .thread_fn = deferred_init_memmap_chunk,
- .fn_arg = zone,
- .start = spfn,
- .size = epfn_align - spfn,
- .align = PAGES_PER_SECTION,
- .min_chunk = PAGES_PER_SECTION,
- .max_threads = max_threads,
- .numa_aware = false,
- };
+ padata_do_multithreaded(&job);
- padata_do_multithreaded(&job);
- deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- epfn_align);
- }
-zone_empty:
/* Sanity check that the next zone really is unpopulated */
- WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+ WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
pr_info("node %d deferred pages initialised in %ums\n",
pgdat->node_id, jiffies_to_msecs(jiffies - start));
@@ -2220,12 +2184,11 @@ zone_empty:
*/
bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
{
- unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+ unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
unsigned long spfn, epfn, flags;
unsigned long nr_pages = 0;
- u64 i;
/* Only the last zone may have deferred pages */
if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
@@ -2242,37 +2205,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- /* If the zone is empty somebody else may have cleared out the zone */
- if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
- first_deferred_pfn)) {
- pgdat->first_deferred_pfn = ULONG_MAX;
- pgdat_resize_unlock(pgdat, &flags);
- /* Retry only once. */
- return first_deferred_pfn != ULONG_MAX;
+ /*
+ * Initialize at least nr_pages_needed in section chunks.
+ * If a section has less free memory than nr_pages_needed, the next
+ * section will also be initialized.
+ * Note that this still does not guarantee that an allocation of this
+ * order can be satisfied if the sections are fragmented because of
+ * memblock allocations.
+ */
+ for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
+ nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
+ spfn = epfn, epfn += PAGES_PER_SECTION) {
+ nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
}
/*
- * Initialize and free pages in MAX_PAGE_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
+ * There were no pages to initialize and free which means the zone's
+ * memory map is completely initialized.
*/
- while (spfn < epfn) {
- /* update our first deferred PFN for this section */
- first_deferred_pfn = spfn;
+ pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
- touch_nmi_watchdog();
-
- /* We should only stop along section boundaries */
- if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
- continue;
-
- /* If our quota has been met we can stop here */
- if (nr_pages >= nr_pages_needed)
- break;
- }
-
- pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
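
For a sense of scale in the loop above, with the assumed geometry of 4 KiB pages and 128 MiB sections (PAGES_PER_SECTION == 32768, the x86-64 default), even a small request is rounded up to a full section, and further whole sections are appended until enough pages have actually been initialized:

	#include <stdio.h>

	/* assumed geometry: 4 KiB pages, 128 MiB sections (x86-64 defaults) */
	#define PAGES_PER_SECTION	(1UL << 15)
	#define SECTION_ALIGN_UP(x)	(((x) + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1))

	int main(void)
	{
		unsigned int order = 10;	/* e.g. a pageblock-sized allocation */
		unsigned long nr_pages_needed = SECTION_ALIGN_UP(1UL << order);

		/* 1024 pages requested, but at least 32768 (one section) will be initialized */
		printf("%lu\n", nr_pages_needed);
		return 0;
	}
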
@@ -2291,13 +2243,24 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_pageblock_migratetype(page, MIGRATE_CMA);
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
set_page_refcounted(page);
+ /* pages were reserved and not allocated */
+ clear_page_tag_ref(page);
__free_pages(page, pageblock_order);
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
+/*
+ * Similar to above, but only set the migrate type and stats.
+ */
+void __init init_cma_pageblock(struct page *page)
+{
+ init_pageblock_migratetype(page, MIGRATE_CMA, false);
+ adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
+}
#endif
void set_zone_contiguous(struct zone *zone)
@@ -2322,6 +2285,32 @@ void set_zone_contiguous(struct zone *zone)
zone->contiguous = true;
}
+/*
+ * Check if a PFN range intersects multiple zones on one or more
+ * NUMA nodes. Specify the @nid argument if it is known that this
+ * PFN range is on one node, NUMA_NO_NODE otherwise.
+ */
+bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct zone *zone, *izone = NULL;
+
+ for_each_zone(zone) {
+ if (nid != NUMA_NO_NODE && zone_to_nid(zone) != nid)
+ continue;
+
+ if (zone_intersects(zone, start_pfn, nr_pages)) {
+ if (izone != NULL)
+ return true;
+ izone = zone;
+ }
+
+ }
+
+ return false;
+}
+
+static void __init mem_init_print_info(void);
void __init page_alloc_init_late(void)
{
struct zone *zone;
@@ -2348,6 +2337,8 @@ void __init page_alloc_init_late(void)
files_maxfiles_init();
#endif
+ /* Accounting of total+free memory is stable at this point. */
+ mem_init_print_info();
buffer_init();
/* Discard memblock private memory */
@@ -2478,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename,
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ tablename, 1UL << log2qty, get_order(size), size,
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
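
The switch from "ilog2(size) - PAGE_SHIFT" to get_order(size) matters when the hash table does not occupy an exact power-of-two number of pages: ilog2() truncates and can report an order one too small, while get_order() rounds up. A small sketch using simplified stand-ins for the kernel helpers:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	/* simplified stand-ins, not the kernel's exact implementations */
	static unsigned int ilog2_ul(unsigned long v)
	{
		unsigned int r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	static unsigned int get_order(unsigned long size)
	{
		unsigned int order = 0;

		size = (size - 1) >> PAGE_SHIFT;
		while (size) {
			order++;
			size >>= 1;
		}
		return order;
	}

	int main(void)
	{
		unsigned long size = 3 * PAGE_SIZE;	/* a 12 KiB table */

		/* old formula prints 1 (8 KiB, too small), get_order() prints 2 (16 KiB) */
		printf("old: %u, new: %u\n", ilog2_ul(size) - PAGE_SHIFT, get_order(size));
		return 0;
	}
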
@@ -2505,16 +2496,8 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
}
/* pages were reserved and not allocated */
- if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
-
- if (ref) {
- set_codetag_empty(ref);
- put_page_tag_ref(ref);
- }
- }
-
- __free_pages_core(page, order);
+ clear_page_tag_ref(page);
+ __free_pages_core(page, order, MEMINIT_EARLY);
}
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
@@ -2542,6 +2525,14 @@ early_param("init_on_free", early_init_on_free);
DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
+static bool check_pages_enabled_early __initdata;
+
+static int __init early_check_pages(char *buf)
+{
+ return kstrtobool(buf, &check_pages_enabled_early);
+}
+early_param("check_pages", early_check_pages);
+
/*
* Enable static keys related to various memory debugging and hardening options.
* Some override others, and depend on early params that are evaluated in the
@@ -2551,7 +2542,7 @@ DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
static void __init mem_debugging_and_hardening_init(void)
{
bool page_poisoning_requested = false;
- bool want_check_pages = false;
+ bool want_check_pages = check_pages_enabled_early;
#ifdef CONFIG_PAGE_POISONING
/*
@@ -2621,18 +2612,12 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
- stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
- want_init_on_free() ? "on" : "off");
+ stack, str_on_off(want_init_on_alloc(GFP_KERNEL)),
+ str_on_off(want_init_on_free()));
if (want_init_on_free())
pr_info("mem auto-init: clearing system memory may take some time...\n");
}
@@ -2688,15 +2673,27 @@ static void __init mem_init_print_info(void)
);
}
+void __init __weak arch_mm_preinit(void)
+{
+}
+
+void __init __weak mem_init(void)
+{
+}
+
/*
* Set up kernel memory allocators
*/
void __init mm_core_init(void)
{
+ arch_mm_preinit();
+ hugetlb_bootmem_alloc();
+
/* Initializations relying on SMP setup */
+ BUILD_BUG_ON(MAX_ZONELISTS > 2);
build_all_zonelists(NULL);
page_alloc_init_cpuhp();
-
+ alloc_tag_sec_init();
/*
* page_ext requires contiguous pages,
* bigger than MAX_PAGE_ORDER unless SPARSEMEM.
@@ -2707,8 +2704,15 @@ void __init mm_core_init(void)
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
+
+ /*
+ * KHO memory setup must happen while memblock is still active, but
+ * as close as possible to buddy initialization
+ */
+ kho_memory_init();
+
+ memblock_free_all();
mem_init();
- mem_init_print_info();
kmem_cache_init();
/*
* page_owner must be initialized after buddy is ready, and also after