From 8e7a7f8619f1f93736d9bb7e31caf4721bdc739d Mon Sep 17 00:00:00 2001
From: Robin Holt
Date: Tue, 30 Jun 2015 14:56:41 -0700
Subject: memblock: introduce a for_each_reserved_mem_region iterator

Struct page initialisation had been identified as one of the reasons why
large machines take a long time to boot. Patches were posted a long time
ago to defer initialisation until they were first used. This was rejected
on the grounds it should not be necessary to hurt the fast paths. This
series reuses much of the work from that time but defers the initialisation
of memory to kswapd so that one thread per node initialises memory local
to that node.

After applying the series and setting the appropriate Kconfig variable I
see this in the boot log on a 64G machine

[ 7.383764] kswapd 0 initialised deferred memory in 188ms
[ 7.404253] kswapd 1 initialised deferred memory in 208ms
[ 7.411044] kswapd 3 initialised deferred memory in 216ms
[ 7.411551] kswapd 2 initialised deferred memory in 216ms

On a 1TB machine, I see

[ 8.406511] kswapd 3 initialised deferred memory in 1116ms
[ 8.428518] kswapd 1 initialised deferred memory in 1140ms
[ 8.435977] kswapd 0 initialised deferred memory in 1148ms
[ 8.437416] kswapd 2 initialised deferred memory in 1148ms

Once booted the machine appears to work as normal. Boot times were measured
from the time shutdown was called until ssh was available again. In the 64G
case, the boot time savings are negligible. On the 1TB machine, the savings
were 16 seconds.

Nate Zimmer said:

: On an older 8 TB box with lots and lots of cpus the boot time, as
: measure from grub to login prompt, the boot time improved from 1484
: seconds to exactly 1000 seconds.

Waiman Long said:

: I ran a bootup timing test on a 12-TB 16-socket IvyBridge-EX system. From
: grub menu to ssh login, the bootup time was 453s before the patch and 265s
: after the patch - a saving of 188s (42%).

Daniel Blueman said:

: On a 7TB, 1728-core NumaConnect system with 108 NUMA nodes, we're seeing
: stock 4.0 boot in 7136s. This drops to 2159s, or a 70% reduction with
: this patchset. Non-temporal PMD init (https://lkml.org/lkml/2015/4/23/350)
: drops this to 1045s.

This patch (of 13):

As part of initializing struct page's in 2MiB chunks, we noticed that at
the end of free_all_bootmem(), there was nothing which had forced the
reserved/allocated 4KiB pages to be initialized.

This helper function will be used for that expansion.

Signed-off-by: Robin Holt
Signed-off-by: Nate Zimmer
Signed-off-by: Mel Gorman
Tested-by: Nate Zimmer
Tested-by: Waiman Long
Tested-by: Daniel J Blueman
Acked-by: Pekka Enberg
Cc: Robin Holt
Cc: Dave Hansen
Cc: Waiman Long
Cc: Scott Norton
Cc: "Luck, Tony"
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memblock.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 1b444c730846..114df622853f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -819,6 +819,38 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
 }
 
+/**
+ * __next_reserved_mem_region - next function for for_each_reserved_region()
+ * @idx: pointer to u64 loop variable
+ * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
+ *
+ * Iterate over all reserved memory regions.
+ */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *rsv = &memblock.reserved; + + if (*idx >= 0 && *idx < rsv->cnt) { + struct memblock_region *r = &rsv->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + /** * __next__mem_range - next function for for_each_free_mem_range() etc. * @idx: pointer to u64 loop variable -- cgit From 1e8ce83cd17fd0f549a7ad145ddd2bfcdd7dfe37 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 30 Jun 2015 14:56:45 -0700 Subject: mm: meminit: move page initialization into a separate function Currently, memmap_init_zone() has all the smarts for initializing a single page. A subset of this is required for parallel page initialisation and so this patch breaks up the monolithic function in preparation. Signed-off-by: Robin Holt Signed-off-by: Nathan Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 79 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e6fa06f2784..bc5da2cdfc84 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -764,6 +764,51 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return 0; } +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + struct zone *z = &NODE_DATA(nid)->node_zones[zone]; + + set_page_links(page, zone, nid, pfn); + mminit_verify_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + SetPageReserved(page); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. Later some blocks near + * the start are marked MIGRATE_RESERVE by + * setup_zone_migrate_reserve() + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. + */ + if ((z->zone_start_pfn <= pfn) + && (pfn < zone_end_pfn(z)) + && !(pfn & (pageblock_nr_pages - 1))) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. 
*/ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); @@ -4212,7 +4257,6 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; @@ -4233,38 +4277,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (!early_pfn_in_nid(pfn, nid)) continue; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_pfn(pfn, zone, nid); } } -- cgit From 92923ca3aacef63c92dc297a75ad0c6dfe4eab37 Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 30 Jun 2015 14:56:48 -0700 Subject: mm: meminit: only set page reserved in the memblock region Currently each page struct is set as reserved upon initialization. This patch leaves the reserved bit clear and only sets the reserved bit when it is known the memory was allocated by the bootmem allocator. This makes it easier to distinguish between uninitialised struct pages and reserved struct pages in later patches. Signed-off-by: Robin Holt Signed-off-by: Nathan Zimmer Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 3 +++ mm/page_alloc.c | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 5258386fa1be..4af8f88c2bd1 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -130,6 +130,9 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) count += __free_memory_core(start, end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc5da2cdfc84..39c8d56a4056 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -774,7 +774,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); - SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -809,6 +808,22 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); } +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. + */ +void reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) + if (pfn_valid(start_pfn)) + SetPageReserved(pfn_to_page(start_pfn)); +} + static bool free_pages_prepare(struct page *page, unsigned int order) { bool compound = PageCompound(page); -- cgit From d70ddd7a5d9aa335f9b4b0c3d879e1e70ee1e4e3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:52 -0700 Subject: mm: page_alloc: pass PFN to __free_pages_bootmem __free_pages_bootmem prepares a page for release to the buddy allocator and assumes that the struct page is initialised. Parallel initialisation of struct pages defers initialisation and __free_pages_bootmem can be called for struct pages that cannot yet map struct page to PFN. This patch passes PFN to __free_pages_bootmem with no other functional change. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 13 +++++++------ mm/internal.h | 3 ++- mm/memblock.c | 2 +- mm/nobootmem.c | 4 ++-- mm/page_alloc.c | 3 ++- 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 477be696511d..a23dd1934654 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); diff --git a/mm/internal.h b/mm/internal.h index a25e359a4039..58e9022e3757 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); diff --git a/mm/memblock.c b/mm/memblock.c index 114df622853f..87108e77e476 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1419,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 4af8f88c2bd1..e57cf24babd6 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -86,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -101,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, 
unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39c8d56a4056..c2ee4ecad083 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -878,7 +878,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; -- cgit From 8a942fdea560d4ac0e9d9fabcd5201ad20e0c382 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:55 -0700 Subject: mm: meminit: make __early_pfn_to_nid SMP-safe and introduce meminit_pfn_in_nid __early_pfn_to_nid() use static variables to cache recent lookups as memblock lookups are very expensive but it assumes that memory initialisation is single-threaded. Parallel initialisation of struct pages will break that assumption so this patch makes __early_pfn_to_nid() SMP-safe by requiring the caller to cache recent search information. early_pfn_to_nid() keeps the same interface but is only safe to use early in boot due to the use of a global static variable. meminit_pfn_in_nid() is an SMP-safe version that callers must maintain their own state for. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2ee4ecad083..ffdb2308848d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4551,39 +4551,41 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +/* Only safe to use early in boot when initialisation is single-threaded */ int __meminit early_pfn_to_nid(unsigned long pfn) { int nid; - nid = __early_pfn_to_nid(pfn); + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0) return nid; /* just returns 0 */ @@ -4591,15 +4593,23 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; - nid = __early_pfn_to_nid(pfn); + nid = __early_pfn_to_nid(pfn, state); if (nid >= 0 && nid != node) return false; return true; } + +/* Only safe to use early in boot when initialisation is single-threaded */ +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + #endif /** -- cgit From 75a592a47129dcfc1aec40e7d3cdf239a767d441 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:56:59 -0700 Subject: mm: meminit: inline some helper functions early_pfn_in_nid() and meminit_pfn_in_nid() are small functions that are unnecessarily visible outside memory initialisation. As well as unnecessary visibility, it's unnecessary function call overhead when initialising pages. This patch moves the helpers inline. [akpm@linux-foundation.org: fix build] [mhocko@suse.cz: fix build] Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 89 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ffdb2308848d..12a81870815f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -899,6 +899,58 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +/* Only safe to use early in boot when initialisation is single-threaded */ +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + + /* The system will behave unpredictably otherwise */ + BUG_ON(system_state != SYSTEM_BOOTING); + + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) @@ -4575,43 +4627,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; - -/* Only safe to use early in boot when initialisation is single-threaded */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - - nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - int nid; - - nid = __early_pfn_to_nid(pfn, state); - if (nid >= 0 && nid != node) - return false; - return true; -} - -/* Only safe to use early in boot when initialisation is single-threaded */ -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
-- cgit From 3a80a7fa7989fbb6aa56bb6ad31811b62cf99e60 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:02 -0700 Subject: mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set This patch initalises all low memory struct pages and 2G of the highest zone on each node during memory initialisation if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set. That config option cannot be set but will be available in a later patch. Parallel initialisation of struct page depends on some features from memory hotplug and it is necessary to alter alter section annotations. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 18 +++++++++++++ mm/internal.h | 18 +++++++++++++ mm/page_alloc.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 111 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c180af880ed5..e79de2bd12cd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -636,3 +636,21 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the systemm until kswapd + finishes the initialisation. diff --git a/mm/internal.h b/mm/internal.h index 58e9022e3757..88ac7be741ca 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -387,6 +387,24 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ +/* + * Deferred struct page initialisation requires init functions that are freed + * before kswapd is available. Reuse the memory hotplug section annotation + * to mark the required code. + * + * __defermem_init is code that always exists but is annotated __meminit to + * avoid section warnings. + * __defer_init code gets marked __meminit when deferring struct page + * initialistion but is otherwise in the init section. 
+ */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +#define __defermem_init __meminit +#define __defer_init __meminit +#else +#define __defermem_init +#define __defer_init __init +#endif + /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12a81870815f..7af45b2e8870 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -235,6 +235,64 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) +{ + int nid = early_pfn_to_nid(pfn); + + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. + */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -878,8 +936,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, - unsigned int order) +static void __defer_init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -951,6 +1009,14 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, #endif +void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/ void __init init_cma_reserved_pageblock(struct page *page) @@ -4325,14 +4391,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4344,6 +4412,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } __init_single_pfn(pfn, zone, nid); } @@ -5144,6 +5215,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -- cgit From 7e18adb4f80bea90d30b62158694d97c31f71d37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:05 -0700 Subject: mm: meminit: initialise remaining struct pages in parallel with kswapd Only a subset of struct pages are initialised at the moment. When this patch is applied kswapd initialise the remaining struct pages in parallel. This should boot faster by spreading the work to multiple CPUs and initialising data that is local to the CPU. The user-visible effect on large machines is that free memory will appear to rapidly increase early in the lifetime of the system until kswapd reports that all memory is initialised in the kernel log. Once initialised there should be no other user-visibile effects. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 6 +++ mm/mm_init.c | 1 + mm/page_alloc.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- mm/vmscan.c | 6 ++- 4 files changed, 130 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 88ac7be741ca..71d160437205 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -400,9 +400,15 @@ static inline void mminit_verify_zonelist(void) #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT #define __defermem_init __meminit #define __defer_init __meminit + +void deferred_init_memmap(int nid); #else #define __defermem_init #define __defer_init __init + +static inline void deferred_init_memmap(int nid) +{ +} #endif /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ diff --git a/mm/mm_init.c b/mm/mm_init.c index 5f420f7fafa1..28fbf87b20aa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7af45b2e8870..c30f5a0535fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -252,6 +252,14 @@ static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) return false; } +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + /* * Returns false when the remaining initialisation should be deferred until * later in the boot cycle when it can be parallelised. @@ -284,6 +292,11 @@ static inline bool early_page_uninitialised(unsigned long pfn) return false; } +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) @@ -866,20 +879,51 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /* * Initialised pages do not have PageReserved set. This function is * called for each range allocated by the bootmem allocator and * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. 
*/ -void reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); - for (; start_pfn < end_pfn; start_pfn++) - if (pfn_valid(start_pfn)) - SetPageReserved(pfn_to_page(start_pfn)); + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + SetPageReserved(page); + } + } } static bool free_pages_prepare(struct page *page, unsigned int order) @@ -1017,6 +1061,74 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/* Initialise remaining memory on a node */ +void __defermem_init deferred_init_memmap(int nid) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + + if (first_init_pfn == ULONG_MAX) + return; + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + struct page *page; + + if (!pfn_valid(pfn)) + continue; + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) + continue; + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + continue; + } + + __init_single_page(page, pfn, zid, nid); + __free_pages_boot_core(page, pfn, 0); + nr_pages++; + cond_resched(); + } + first_init_pfn = max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages, + jiffies_to_msecs(jiffies - start)); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) @@ -4329,6 +4441,9 @@ static void setup_zone_migrate_reserve(struct zone *zone) zone->nr_migrate_reserve_block = reserve; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) + return; + if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); diff --git a/mm/vmscan.c b/mm/vmscan.c index e61445dce04e..f4a487110764 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3386,7 +3386,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. 
*/ -static int kswapd(void *p) +static int __defermem_init kswapd(void *p) { unsigned long order, new_order; unsigned balanced_order; @@ -3421,6 +3421,8 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); + deferred_init_memmap(pgdat->node_id); + order = new_order = 0; balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; @@ -3576,7 +3578,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action, * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int kswapd_run(int nid) +int __defermem_init kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; -- cgit From 54608c3f3a448f1e042f5d9f3b873cc8dc022f27 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:09 -0700 Subject: mm: meminit: minimise number of pfn->page lookups during initialisation Deferred struct page initialisation is using pfn_to_page() on every PFN unnecessarily. This patch minimises the number of lookups and scheduler checks. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c30f5a0535fd..0f770cc13450 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1091,6 +1091,7 @@ void __defermem_init deferred_init_memmap(int nid) for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { unsigned long pfn, end_pfn; + struct page *page = NULL; end_pfn = min(walk_end, zone_end_pfn(zone)); pfn = first_init_pfn; @@ -1100,13 +1101,32 @@ void __defermem_init deferred_init_memmap(int nid) pfn = zone->zone_start_pfn; for (; pfn < end_pfn; pfn++) { - struct page *page; - - if (!pfn_valid(pfn)) + if (!pfn_valid_within(pfn)) continue; - if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + continue; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; continue; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + page = pfn_to_page(pfn); + cond_resched(); + } if (page->flags) { VM_BUG_ON(page_zone(page) != zone); @@ -1116,7 +1136,6 @@ void __defermem_init deferred_init_memmap(int nid) __init_single_page(page, pfn, zid, nid); __free_pages_boot_core(page, pfn, 0); nr_pages++; - cond_resched(); } first_init_pfn = max(end_pfn, first_init_pfn); } -- cgit From a4de83dd3377eb43ad95387cc16c27a11aae2feb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:16 -0700 Subject: mm: meminit: free pages in large chunks where possible Parallel struct page frees pages one at a time. Try free pages as single large pages where possible. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f770cc13450..e3f00f622f28 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1062,6 +1062,25 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __defermem_init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + /* Initialise remaining memory on a node */ void __defermem_init deferred_init_memmap(int nid) { @@ -1092,6 +1111,9 @@ void __defermem_init deferred_init_memmap(int nid) for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { unsigned long pfn, end_pfn; struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long free_base_pfn = 0; + int nr_to_free = 0; end_pfn = min(walk_end, zone_end_pfn(zone)); pfn = first_init_pfn; @@ -1102,7 +1124,7 @@ void __defermem_init deferred_init_memmap(int nid) for (; pfn < end_pfn; pfn++) { if (!pfn_valid_within(pfn)) - continue; + goto free_range; /* * Ensure pfn_valid is checked every @@ -1111,32 +1133,53 @@ void __defermem_init deferred_init_memmap(int nid) if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { if (!pfn_valid(pfn)) { page = NULL; - continue; + goto free_range; } } if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { page = NULL; - continue; + goto free_range; } /* Minimise pfn page lookups and scheduler checks */ if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { page++; } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + page = pfn_to_page(pfn); cond_resched(); } if (page->flags) { VM_BUG_ON(page_zone(page) != zone); - continue; + goto free_range; } __init_single_page(page, pfn, zid, nid); - __free_pages_boot_core(page, pfn, 0); - nr_pages++; + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; } + first_init_pfn = max(end_pfn, first_init_pfn); } -- cgit From ac5d2539b2382689b1cdb90bd60dcd49f61c2773 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:20 -0700 Subject: mm: meminit: reduce number of times pageblocks are set during struct page init During parallel sturct page initialisation, ranges are checked for every PFN unnecessarily which increases boot times. This patch alters when the ranges are checked. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3f00f622f28..f1f455a69cef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -838,33 +838,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { - struct zone *z = &NODE_DATA(nid)->node_zones[zone]; - set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() - * - * bitmap is created for zone's valid pfn range. but memmap - * can be created for invalid pages (for alignment) - * check here not to call set_pageblock_migratetype() against - * pfn out of zone. - */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - INIT_LIST_HEAD(&page->lru); #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ @@ -1073,6 +1052,7 @@ static void __defermem_init deferred_free_range(struct page *page, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_boot_core(page, pfn, MAX_ORDER-1); return; } @@ -4593,7 +4573,29 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, &nr_initialised)) break; } - __init_single_pfn(pfn, zone, nid); + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. Later some blocks near + * the start are marked MIGRATE_RESERVE by + * setup_zone_migrate_reserve() + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) + * check here not to call set_pageblock_migratetype() against + * pfn out of zone. + */ + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } -- cgit From 74033a798f5a5db368126ee6f690111cf019bf7a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:23 -0700 Subject: mm: meminit: remove mminit_verify_page_links mminit_verify_page_links() is an extremely paranoid check that was introduced when memory initialisation was being heavily reworked. Profiles indicated that up to 10% of parallel memory initialisation was spent on checking this for every page. The cost could be reduced but in practice this check only found problems very early during the initialisation rewrite and has found nothing since. 
This patch removes an expensive unnecessary check. Signed-off-by: Mel Gorman Tested-by: Nate Zimmer Tested-by: Waiman Long Tested-by: Daniel J Blueman Acked-by: Pekka Enberg Cc: Robin Holt Cc: Nate Zimmer Cc: Dave Hansen Cc: Waiman Long Cc: Scott Norton Cc: "Luck, Tony" Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 8 -------- mm/mm_init.c | 8 -------- mm/page_alloc.c | 1 - 3 files changed, 17 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 71d160437205..a48cbefde8ca 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -362,10 +362,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -377,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) -{ -} - static inline void mminit_verify_zonelist(void) { } diff --git a/mm/mm_init.c b/mm/mm_init.c index 28fbf87b20aa..fdadf918de76 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -131,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f1f455a69cef..5a38e39b30d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -839,7 +839,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); page_cpupid_reset_last(page); -- cgit From 0e1cc95b4cc7293bb7b39175035e7f7e45c90977 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 30 Jun 2015 14:57:27 -0700 Subject: mm: meminit: finish initialisation of struct pages before basic setup Waiman Long reported that 24TB machines hit OOM during basic setup when struct page initialisation was deferred. One approach is to initialise memory on demand but it interferes with page allocator paths. This patch creates dedicated threads to initialise memory before basic setup. It then blocks on a rw_semaphore until completion as a wait_queue and counter is overkill. This may be slower to boot but it's simplier overall and also gets rid of a section mangling which existed so kswapd could do the initialisation. 
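The coordination this introduces is compact enough to show on its own. The
sketch below condenses what the patch's page_alloc_init_late() and
deferred_init_memmap() do with pgdat_init_rwsem: take the semaphore for read
once per node thread before starting it, let each thread drop its read hold
when it finishes, and use a single write acquisition to wait for all of them.
The cpumask binding, boundary checks and the actual initialisation loop of
the full patch are omitted from this sketch.

static __initdata DECLARE_RWSEM(pgdat_init_rwsem);

static int __init deferred_init_memmap(void *data)
{
        pg_data_t *pgdat = data;

        /* ... initialise this node's remaining struct pages ... */

        /* Drop the read hold taken on this thread's behalf below. */
        up_read(&pgdat_init_rwsem);
        return 0;
}

void __init page_alloc_init_late(void)
{
        int nid;

        for_each_node_state(nid, N_MEMORY) {
                /* One read hold per node thread about to be started. */
                down_read(&pgdat_init_rwsem);
                kthread_run(deferred_init_memmap, NODE_DATA(nid),
                            "pgdatinit%d", nid);
        }

        /*
         * down_write() only succeeds once every reader has dropped its
         * hold, i.e. once every per-node thread has finished.
         */
        down_write(&pgdat_init_rwsem);
        up_write(&pgdat_init_rwsem);
}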
[akpm@linux-foundation.org: include rwsem.h, use DECLARE_RWSEM, fix comment, remove unneeded cast] Signed-off-by: Mel Gorman Cc: Waiman Long Cc: Dave Hansen Cc: Scott Norton Tested-by: Daniel J Blueman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 24 ------------------------ mm/page_alloc.c | 46 +++++++++++++++++++++++++++++++++++++--------- mm/vmscan.c | 6 ++---- 3 files changed, 39 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a48cbefde8ca..36b23f1e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -379,30 +379,6 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ -/* - * Deferred struct page initialisation requires init functions that are freed - * before kswapd is available. Reuse the memory hotplug section annotation - * to mark the required code. - * - * __defermem_init is code that always exists but is annotated __meminit to - * avoid section warnings. - * __defer_init code gets marked __meminit when deferring struct page - * initialistion but is otherwise in the init section. - */ -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -#define __defermem_init __meminit -#define __defer_init __meminit - -void deferred_init_memmap(int nid); -#else -#define __defermem_init -#define __defer_init __init - -static inline void deferred_init_memmap(int nid) -{ -} -#endif - /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ #if defined(CONFIG_SPARSEMEM) extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a38e39b30d1..506eac8b38af 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include #include #include +#include #include #include @@ -242,7 +244,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) } /* Returns true if the struct page for the pfn is uninitialised */ -static inline bool __defermem_init early_page_uninitialised(unsigned long pfn) +static inline bool __meminit early_page_uninitialised(unsigned long pfn) { int nid = early_pfn_to_nid(pfn); @@ -958,7 +960,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __defer_init __free_pages_boot_core(struct page *page, +static void __init __free_pages_boot_core(struct page *page, unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; @@ -1031,7 +1033,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) @@ -1040,7 +1042,7 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn, } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __defermem_init deferred_free_range(struct page *page, +static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) { int i; @@ -1060,20 +1062,30 @@ static void __defermem_init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } +static __initdata DECLARE_RWSEM(pgdat_init_rwsem); + /* Initialise remaining memory on a node */ -void __defermem_init deferred_init_memmap(int nid) +static int __init deferred_init_memmap(void *data) { + pg_data_t *pgdat = data; + int nid = pgdat->node_id; struct 
mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; unsigned long walk_start, walk_end; int i, zid; struct zone *zone; - pg_data_t *pgdat = NODE_DATA(nid); unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - if (first_init_pfn == ULONG_MAX) - return; + if (first_init_pfn == ULONG_MAX) { + up_read(&pgdat_init_rwsem); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); @@ -1165,8 +1177,24 @@ free_range: /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("kswapd %d initialised %lu pages in %ums\n", nid, nr_pages, + pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); + up_read(&pgdat_init_rwsem); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + down_read(&pgdat_init_rwsem); + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + down_write(&pgdat_init_rwsem); + up_write(&pgdat_init_rwsem); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f4a487110764..e61445dce04e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3386,7 +3386,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -static int __defermem_init kswapd(void *p) +static int kswapd(void *p) { unsigned long order, new_order; unsigned balanced_order; @@ -3421,8 +3421,6 @@ static int __defermem_init kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - deferred_init_memmap(pgdat->node_id); - order = new_order = 0; balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; @@ -3578,7 +3576,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action, * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. */ -int __defermem_init kswapd_run(int nid) +int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int ret = 0; -- cgit
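One piece of the first patch does not appear above because the dump is
limited to mm/: the for_each_reserved_mem_region() iterator macro itself
lives in include/linux/memblock.h. The sketch below shows the shape of that
macro and how the third patch uses it from free_low_memory_core_early();
the macro body is reconstructed from the behaviour of
__next_reserved_mem_region() rather than quoted from the header, and
mark_reserved_regions() is only an illustrative wrapper name.

/*
 * Reconstructed sketch of the header-side iterator (not quoted from
 * include/linux/memblock.h): keep calling __next_reserved_mem_region()
 * until it sets the index to the ULLONG_MAX sentinel.
 */
#define for_each_reserved_mem_region(i, p_start, p_end)                 \
        for (i = 0UL,                                                   \
             __next_reserved_mem_region(&i, p_start, p_end);            \
             i != (u64)ULLONG_MAX;                                      \
             __next_reserved_mem_region(&i, p_start, p_end))

/*
 * Illustrative wrapper around the call added to free_low_memory_core_early()
 * in "mm: meminit: only set page reserved in the memblock region": every
 * range handed out by the bootmem/memblock allocator gets its struct pages
 * marked PageReserved before the free ranges are given to the buddy
 * allocator.
 */
static void __init mark_reserved_regions(void)
{
        phys_addr_t start, end;
        u64 i;

        for_each_reserved_mem_region(i, &start, &end)
                reserve_bootmem_region(start, end);
}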