Diffstat (limited to 'mm/sparse.c')
-rw-r--r--   mm/sparse.c   971
1 file changed, 533 insertions, 438 deletions
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..b5b2b6f7041b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -1,20 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * sparse memory mappings.
  */
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
-
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/bootmem_info.h>
+#include <linux/vmstat.h>
 #include "internal.h"
 #include <asm/dma.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
 
 /*
  * Permanent SPARSEMEM data:
@@ -22,8 +24,7 @@
  * 1) mem_section	- memory sections, mem_map's for valid memory
  */
 #ifdef CONFIG_SPARSEMEM_EXTREME
-struct mem_section *mem_section[NR_SECTION_ROOTS]
-	____cacheline_internodealigned_in_smp;
+struct mem_section **mem_section;
 #else
 struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
 	____cacheline_internodealigned_in_smp;
@@ -42,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
 #endif
 
-int page_to_nid(const struct page *page)
+int memdesc_nid(memdesc_flags_t mdf)
 {
-	return section_to_node_table[page_to_section(page)];
+	return section_to_node_table[memdesc_section(mdf)];
 }
-EXPORT_SYMBOL(page_to_nid);
+EXPORT_SYMBOL(memdesc_nid);
 
 static void set_section_nid(unsigned long section_nr, int nid)
 {
@@ -66,12 +67,13 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
 		sizeof(struct mem_section);
 
 	if (slab_is_available()) {
-		if (node_state(nid, N_HIGH_MEMORY))
-			section = kzalloc_node(array_size, GFP_KERNEL, nid);
-		else
-			section = kzalloc(array_size, GFP_KERNEL);
+		section = kzalloc_node(array_size, GFP_KERNEL, nid);
 	} else {
-		section = memblock_virt_alloc_node(array_size, nid);
+		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
+					      nid);
+		if (!section)
+			panic("%s: Failed to allocate %lu bytes nid=%d\n",
+			      __func__, array_size, nid);
 	}
 
 	return section;
 }
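With CONFIG_SPARSEMEM_EXTREME the root table itself is now a runtime allocation (see memblocks_present() further down), but section lookup stays two-level: section number to root, then offset within the root. Below is a small userspace model of that arithmetic — a sketch only; the constants and names are demo assumptions, not the kernel's values.

/*
 * Two-level section lookup in the style of __nr_to_section().
 * SECTIONS_PER_ROOT / NR_SECTION_ROOTS values are made up.
 * Build: cc -o sections sections.c
 */
#include <stdio.h>
#include <stdlib.h>

struct mem_section { unsigned long section_mem_map; };

#define SECTIONS_PER_ROOT 256	/* assumption for the demo */
#define NR_SECTION_ROOTS  16	/* assumption for the demo */

static struct mem_section *roots[NR_SECTION_ROOTS];	/* ~ mem_section */

static struct mem_section *nr_to_section(unsigned long nr)
{
	unsigned long root = nr / SECTIONS_PER_ROOT;	/* ~ SECTION_NR_TO_ROOT() */

	if (!roots[root])
		return NULL;	/* root never allocated: hole in the space */
	return &roots[root][nr % SECTIONS_PER_ROOT];
}

int main(void)
{
	unsigned long nr = 300;	/* falls in root 1, offset 44 */

	roots[nr / SECTIONS_PER_ROOT] =
		calloc(SECTIONS_PER_ROOT, sizeof(struct mem_section));
	printf("section %lu -> %p\n", nr, (void *)nr_to_section(nr));
	return 0;
}

Only roots covering present memory are ever allocated, which is the point of the EXTREME variant on machines with huge, sparsely populated physical address spaces.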
@@ -82,8 +84,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 
+	/*
+	 * An existing section is possible in the sub-section hotplug
+	 * case. First hot-add instantiates, follow-on hot-add reuses
+	 * the existing section.
+	 *
+	 * The mem_hotplug_lock resolves the apparent race below.
+	 */
 	if (mem_section[root])
-		return -EEXIST;
+		return 0;
 
 	section = sparse_index_alloc(nid);
 	if (!section)
@@ -100,32 +109,6 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
 }
 #endif
 
-#ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
-{
-	unsigned long root_nr;
-	struct mem_section* root;
-
-	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
-		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
-		if (!root)
-			continue;
-
-		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
-			break;
-	}
-
-	VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
-
-	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
-}
-#else
-int __section_nr(struct mem_section* ms)
-{
-	return (int)(ms - mem_section[0]);
-}
-#endif
-
 /*
  * During early boot, before section_mem_map is used for an actual
  * mem_map, we use section_mem_map to store the section's NUMA
@@ -134,7 +117,7 @@ int __section_nr(struct mem_section* ms)
  */
 static inline unsigned long sparse_encode_early_nid(int nid)
 {
-	return (nid << SECTION_NID_SHIFT);
+	return ((unsigned long)nid << SECTION_NID_SHIFT);
 }
 
 static inline int sparse_early_nid(struct mem_section *section)
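The cast added to sparse_encode_early_nid() is a width fix: nid is an int, and a left shift is evaluated in 32 bits before any widening, so high bits of the encoded node id can be lost — or the shift can be undefined outright. A standalone demo; the shift count here is invented to make the hazard visible (the kernel's SECTION_NID_SHIFT lives in mmzone.h and is smaller):

/*
 * Why the (unsigned long) cast matters: widen first, then shift.
 * DEMO_NID_SHIFT is an assumption chosen to exceed 32 bits.
 */
#include <stdio.h>

#define DEMO_NID_SHIFT 40	/* assumed, large enough to show the bug */

int main(void)
{
	int nid = 3;

	/* correct: widen to unsigned long first, then shift */
	unsigned long good = (unsigned long)nid << DEMO_NID_SHIFT;
	printf("widened first: 0x%lx\n", good);	/* 0x30000000000 */

	/*
	 * The unfixed form, (nid << DEMO_NID_SHIFT), would shift a
	 * 32-bit int by 40 bits - undefined behaviour in C - so it is
	 * deliberately not evaluated here.
	 */
	return 0;
}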
@@ -143,10 +126,10 @@ static inline int sparse_early_nid(struct mem_section *section)
 
 /* Validate the physical addressing limitations of the model */
-void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
 						unsigned long *end_pfn)
 {
-	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+	unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;
 
 	/*
 	 * Sanity checks - do not allow an architecture to pass
@@ -177,78 +160,104 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
  * Keeping track of this gives us an easy way to break out of
  * those loops early.
  */
-int __highest_present_section_nr;
-static void section_mark_present(struct mem_section *ms)
+unsigned long __highest_present_section_nr;
+static void __section_mark_present(struct mem_section *ms,
+		unsigned long section_nr)
 {
-	int section_nr = __section_nr(ms);
-
 	if (section_nr > __highest_present_section_nr)
 		__highest_present_section_nr = section_nr;
 
 	ms->section_mem_map |= SECTION_MARKED_PRESENT;
 }
 
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long first_present_section_nr(void)
+{
+	return next_present_section_nr(-1);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
+		unsigned long nr_pages)
 {
-	do {
-		section_nr++;
-		if (present_section_nr(section_nr))
-			return section_nr;
-	} while ((section_nr < NR_MEM_SECTIONS) &&
-		 (section_nr <= __highest_present_section_nr));
+	int idx = subsection_map_index(pfn);
+	int end = subsection_map_index(pfn + nr_pages - 1);
 
-	return -1;
+	bitmap_set(map, idx, end - idx + 1);
 }
 
-#define for_each_present_section_nr(start, section_nr)		\
-	for (section_nr = next_present_section_nr(start-1);	\
-	     ((section_nr >= 0) &&				\
-	      (section_nr < NR_MEM_SECTIONS) &&			\
-	      (section_nr <= __highest_present_section_nr));	\
-	     section_nr = next_present_section_nr(section_nr))
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+	int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1);
+	unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn);
+
+	for (nr = start_sec_nr; nr <= end_sec_nr; nr++) {
+		struct mem_section *ms;
+		unsigned long pfns;
+
+		pfns = min(nr_pages, PAGES_PER_SECTION
+				- (pfn & ~PAGE_SECTION_MASK));
+		ms = __nr_to_section(nr);
+		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+				pfns, subsection_map_index(pfn),
+				subsection_map_index(pfn + pfns - 1));
+
+		pfn += pfns;
+		nr_pages -= pfns;
+	}
+}
+#else
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+}
+#endif
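subsection_mask_set() above turns a pfn range into a run of bits in the per-section subsection_map. A compilable sketch of the same index math, assuming the x86-64-style split of a 128M section into 2M subsections (64 bits, one long):

/*
 * Userspace model of the subsection bookkeeping. The section and
 * subsection sizes are assumed, and bitmap_set() is reduced to a loop.
 */
#include <stdio.h>

#define PAGES_PER_SECTION    32768UL	/* 128M / 4K, assumed */
#define PAGES_PER_SUBSECTION 512UL	/* 2M / 4K, assumed */

static unsigned long subsection_map;	/* 64 subsections fit one long */

static unsigned long subsection_index(unsigned long pfn)
{
	/* offset within the section, in whole subsections */
	return (pfn % PAGES_PER_SECTION) / PAGES_PER_SUBSECTION;
}

static void mask_set(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long idx = subsection_index(pfn);
	unsigned long end = subsection_index(pfn + nr_pages - 1);

	for (; idx <= end; idx++)	/* ~ bitmap_set(map, idx, end - idx + 1) */
		subsection_map |= 1UL << idx;
}

int main(void)
{
	/* hot-add 4M starting 2M into a section: subsections 1 and 2 */
	mask_set(PAGES_PER_SUBSECTION, 2 * PAGES_PER_SUBSECTION);
	printf("subsection_map = %#lx\n", subsection_map);	/* prints 0x6 */
	return 0;
}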
 
 /* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 
 	start &= PAGE_SECTION_MASK;
 	mminit_validate_memmodel_limits(&start, &end);
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
-		unsigned long section = pfn_to_section_nr(pfn);
+		unsigned long section_nr = pfn_to_section_nr(pfn);
 		struct mem_section *ms;
 
-		sparse_index_init(section, nid);
-		set_section_nid(section, nid);
+		sparse_index_init(section_nr, nid);
+		set_section_nid(section_nr, nid);
 
-		ms = __nr_to_section(section);
+		ms = __nr_to_section(section_nr);
 		if (!ms->section_mem_map) {
 			ms->section_mem_map = sparse_encode_early_nid(nid) |
 							SECTION_IS_ONLINE;
-			section_mark_present(ms);
+			__section_mark_present(ms, section_nr);
 		}
 	}
 }
 
 /*
- * Only used by the i386 NUMA architecures, but relatively
- * generic code.
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the systems
+ * memory as present during initialization.
  */
-unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
-						  unsigned long end_pfn)
+static void __init memblocks_present(void)
 {
-	unsigned long pfn;
-	unsigned long nr_pages = 0;
+	unsigned long start, end;
+	int i, nid;
 
-	mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
-	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		if (nid != early_pfn_to_nid(pfn))
-			continue;
+#ifdef CONFIG_SPARSEMEM_EXTREME
+	if (unlikely(!mem_section)) {
+		unsigned long size, align;
 
-		if (pfn_present(pfn))
-			nr_pages += PAGES_PER_SECTION;
+		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
+		align = 1 << (INTERNODE_CACHE_SHIFT);
+		mem_section = memblock_alloc_or_panic(size, align);
 	}
+#endif
 
-	return nr_pages * sizeof(struct page);
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+		memory_present(nid, start, end);
 }
 
 /*
@@ -258,9 +267,14 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
  */
 static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
 {
-	return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+	unsigned long coded_mem_map =
+		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
+	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+	return coded_mem_map;
 }
 
+#ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Decode mem_map from the coded memmap
  */
@@ -270,41 +284,45 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
 	coded_mem_map &= SECTION_MAP_MASK;
 	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
 }
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
-static int __meminit sparse_init_one_section(struct mem_section *ms,
+static void __meminit sparse_init_one_section(struct mem_section *ms,
 		unsigned long pnum, struct page *mem_map,
-		unsigned long *pageblock_bitmap)
+		struct mem_section_usage *usage, unsigned long flags)
 {
-	if (!present_section(ms))
-		return -EINVAL;
-
 	ms->section_mem_map &= ~SECTION_MAP_MASK;
-	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
-							SECTION_HAS_MEM_MAP;
-	ms->pageblock_flags = pageblock_bitmap;
-
-	return 1;
+	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+		| SECTION_HAS_MEM_MAP | flags;
+	ms->usage = usage;
 }
 
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
 {
 	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
 {
-	return kmalloc(usemap_size(), GFP_KERNEL);
+	return sizeof(struct mem_section_usage) + usemap_size();
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
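sparse_encode_mem_map() stores mem_map minus the section's starting pfn, so the coded value plus any pfn of the section yields that pfn's struct page with no further lookup; the new BUILD_BUG_ON/BUG_ON assert that the low bits stay free for the SECTION_* flag bits. A minimal model, with stand-in types and the pointer math done in integers to keep the demo well defined:

/*
 * Model of the encode/decode pair. PAGES_PER_SECTION and struct page
 * are assumptions for the demo.
 */
#include <assert.h>
#include <stdio.h>

struct page { unsigned long flags; };

#define PAGES_PER_SECTION 32768UL	/* assumed */

static unsigned long encode(struct page *mem_map, unsigned long pnum)
{
	/* subtract the section's first pfn, scaled by sizeof(struct page) */
	return (unsigned long)mem_map -
	       pnum * PAGES_PER_SECTION * sizeof(struct page);
}

static struct page *pfn_to_page_fast(unsigned long coded, unsigned long pfn)
{
	/* coded map + pfn: no per-section base lookup needed */
	return (struct page *)(coded + pfn * sizeof(struct page));
}

int main(void)
{
	static struct page map[PAGES_PER_SECTION];	/* memmap of section 3 */
	unsigned long pnum = 3;
	unsigned long coded = encode(map, pnum);
	unsigned long pfn = pnum * PAGES_PER_SECTION + 7;

	assert(pfn_to_page_fast(coded, pfn) == &map[7]);
	printf("coded lookup OK\n");
	return 0;
}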
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NUMA
+	VM_BUG_ON(pgdat != &contig_page_data);
+	return __pa_symbol(&contig_page_data);
+#else
+	return __pa(pgdat);
+#endif
+}
+
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
+	struct mem_section_usage *usage;
 	unsigned long goal, limit;
-	unsigned long *p;
 	int nid;
 	/*
 	 * A page may contain usemaps for other sections preventing the
@@ -316,30 +334,35 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 	 * from the same section as the pgdat where possible to avoid
 	 * this problem.
 	 */
-	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+	goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
 	limit = goal + (1UL << PA_SECTION_SHIFT);
 	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
 again:
-	p = memblock_virt_alloc_try_nid_nopanic(size,
-						SMP_CACHE_BYTES, goal, limit,
-						nid);
-	if (!p && limit) {
-		limit = 0;
+	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+	if (!usage && limit) {
+		limit = MEMBLOCK_ALLOC_ACCESSIBLE;
 		goto again;
 	}
-	return p;
+	return usage;
 }
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 	unsigned long usemap_snr, pgdat_snr;
-	static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
-	static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
+	static unsigned long old_usemap_snr;
+	static unsigned long old_pgdat_snr;
 	struct pglist_data *pgdat = NODE_DATA(nid);
 	int usemap_nid;
 
-	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
-	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+	/* First call */
+	if (!old_usemap_snr) {
+		old_usemap_snr = NR_MEM_SECTIONS;
+		old_pgdat_snr = NR_MEM_SECTIONS;
+	}
+
+	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
+	pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
 	if (usemap_snr == pgdat_snr)
 		return;
 
@@ -366,186 +389,202 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
 		usemap_snr, pgdat_snr, nid);
 }
 #else
-static unsigned long * __init
+static struct mem_section_usage * __init
 sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
 					 unsigned long size)
 {
-	return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}
 
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+		struct mem_section_usage *usage)
 {
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
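The goal/limit pair above deliberately confines the usage-map allocation to the physical section that already holds the node's pgdat, so hot-removing any other section never finds a foreign usemap pinning it; only when that fails does the retry widen the range to MEMBLOCK_ALLOC_ACCESSIBLE. The masking itself, sketched with an assumed 128M section size:

/* Section-aligning a physical address; SECTION_SIZE is assumed. */
#include <stdio.h>

#define SECTION_SIZE (128UL << 20)	/* assumed PA_SECTION_SHIFT = 27 */

int main(void)
{
	unsigned long pgdat_phys = 0x123456789UL;	/* arbitrary address */
	unsigned long goal = pgdat_phys & ~(SECTION_SIZE - 1);
	unsigned long limit = goal + SECTION_SIZE;

	/* allocating inside [goal, limit) keeps the usemap co-located
	 * with the pgdat's section */
	printf("pgdat at %#lx -> section [%#lx, %#lx)\n",
	       pgdat_phys, goal, limit);
	return 0;
}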
 
-static void __init sparse_early_usemaps_alloc_node(void *data,
-				 unsigned long pnum_begin,
-				 unsigned long pnum_end,
-				 unsigned long usemap_count, int nodeid)
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+unsigned long __init section_map_size(void)
 {
-	void *usemap;
-	unsigned long pnum;
-	unsigned long **usemap_map = (unsigned long **)data;
-	int size = usemap_size();
-
-	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
-							  size * usemap_count);
-	if (!usemap) {
-		pr_warn("%s: allocation failed\n", __func__);
-		return;
-	}
+	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
+}
 
-	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-		if (!present_section_nr(pnum))
-			continue;
-		usemap_map[pnum] = usemap;
-		usemap += size;
-		check_usemap_section_nr(nodeid, usemap_map[pnum]);
-	}
+#else
+unsigned long __init section_map_size(void)
+{
+	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
 }
 
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+		struct dev_pagemap *pgmap)
 {
-	struct page *map;
-	unsigned long size;
+	unsigned long size = section_map_size();
+	struct page *map = sparse_buffer_alloc(size);
+	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
 
-	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
 		return map;
 
-	size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
-	map = memblock_virt_alloc_try_nid(size,
-					  PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-					  BOOTMEM_ALLOC_ACCESSIBLE, nid);
+	map = memmap_alloc(size, size, addr, nid, false);
+	if (!map)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
+		      __func__, size, PAGE_SIZE, nid, &addr);
+
 	return map;
 }
 
-void __init sparse_mem_maps_populate_node(struct page **map_map,
-					  unsigned long pnum_begin,
-					  unsigned long pnum_end,
-					  unsigned long map_count, int nodeid)
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+static void *sparsemap_buf __meminitdata;
+static void *sparsemap_buf_end __meminitdata;
+
+static inline void __meminit sparse_buffer_free(unsigned long size)
 {
-	void *map;
-	unsigned long pnum;
-	unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
-
-	map = alloc_remap(nodeid, size * map_count);
-	if (map) {
-		for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-			if (!present_section_nr(pnum))
-				continue;
-			map_map[pnum] = map;
-			map += size;
-		}
-		return;
-	}
+	WARN_ON(!sparsemap_buf || size == 0);
+	memblock_free(sparsemap_buf, size);
+}
 
-	size = PAGE_ALIGN(size);
-	map = memblock_virt_alloc_try_nid(size * map_count,
-					  PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
-					  BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
-	if (map) {
-		for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-			if (!present_section_nr(pnum))
-				continue;
-			map_map[pnum] = map;
-			map += size;
-		}
-		return;
-	}
+static void __init sparse_buffer_init(unsigned long size, int nid)
+{
+	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
+
+	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
+	/*
+	 * Pre-allocated buffer is mainly used by __populate_section_memmap
+	 * and we want it to be properly aligned to the section size - this is
+	 * especially the case for VMEMMAP which maps memmap to PMDs
+	 */
+	sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
+	sparsemap_buf_end = sparsemap_buf + size;
+}
 
-	/* fallback */
-	for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
-		struct mem_section *ms;
+static void __init sparse_buffer_fini(void)
+{
+	unsigned long size = sparsemap_buf_end - sparsemap_buf;
 
-		if (!present_section_nr(pnum))
-			continue;
-		map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
-		if (map_map[pnum])
-			continue;
-		ms = __nr_to_section(pnum);
-		pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
-		       __func__);
-		ms->section_mem_map = 0;
+	if (sparsemap_buf && size > 0)
+		sparse_buffer_free(size);
+	sparsemap_buf = NULL;
+}
+
+void * __meminit sparse_buffer_alloc(unsigned long size)
+{
+	void *ptr = NULL;
+
+	if (sparsemap_buf) {
+		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
+		if (ptr + size > sparsemap_buf_end)
+			ptr = NULL;
+		else {
+			/* Free redundant aligned space */
+			if ((unsigned long)(ptr - sparsemap_buf) > 0)
+				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
+			sparsemap_buf = ptr + size;
+		}
 	}
+	return ptr;
 }
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-static void __init sparse_early_mem_maps_alloc_node(void *data,
-				 unsigned long pnum_begin,
-				 unsigned long pnum_end,
-				 unsigned long map_count, int nodeid)
+void __weak __meminit vmemmap_populate_print_last(void)
 {
-	struct page **map_map = (struct page **)data;
-	sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
-					 map_count, nodeid);
 }
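sparse_buffer_alloc() is a bump allocator over the per-node buffer set up in sparse_buffer_init(): each request is rounded up to its own size for alignment, the skipped prefix is handed back to memblock via sparse_buffer_free(), and the cursor advances past the allocation. An illustrative userspace reduction (sizes are arbitrary, and the "free" is just reported):

/* Bump allocator with size-alignment, modeled on sparse_buffer_alloc(). */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static char *buf, *buf_end;

static void *buffer_alloc(size_t size)
{
	uintptr_t p = ((uintptr_t)buf + size - 1) / size * size;	/* roundup */

	if (!buf || (char *)p + size > buf_end)
		return NULL;
	if ((char *)p != buf)	/* the kernel frees this gap immediately */
		printf("released %zu-byte alignment gap\n",
		       (size_t)((char *)p - buf));
	buf = (char *)p + size;
	return (void *)p;
}

int main(void)
{
	char *base = malloc(1 << 20);

	buf = base;
	buf_end = base + (1 << 20);
	void *a = buffer_alloc(4096);	/* aligned to 4K */
	void *b = buffer_alloc(65536);	/* may skip a gap to reach 64K alignment */
	printf("a=%p b=%p\n", a, b);
	free(base);
	return 0;
}

Aligning each memmap chunk to its own size is what lets the VMEMMAP case back section memmaps with PMD-sized mappings.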
 
-#else
-static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
+static void *sparse_usagebuf __meminitdata;
+static void *sparse_usagebuf_end __meminitdata;
+
+/*
+ * Helper function that is used for generic section initialization, and
+ * can also be used by any hooks added above.
+ */
+void __init sparse_init_early_section(int nid, struct page *map,
+		unsigned long pnum, unsigned long flags)
 {
-	struct page *map;
-	struct mem_section *ms = __nr_to_section(pnum);
-	int nid = sparse_early_nid(ms);
+	BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
+	check_usemap_section_nr(nid, sparse_usagebuf);
+	sparse_init_one_section(__nr_to_section(pnum), pnum, map,
+			sparse_usagebuf, SECTION_IS_EARLY | flags);
+	sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
+}
 
-	map = sparse_mem_map_populate(pnum, nid);
-	if (map)
-		return map;
+static int __init sparse_usage_init(int nid, unsigned long map_count)
+{
+	unsigned long size;
 
-	pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
-	       __func__);
-	ms->section_mem_map = 0;
-	return NULL;
+	size = mem_section_usage_size() * map_count;
+	sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section(
+			NODE_DATA(nid), size);
+	if (!sparse_usagebuf) {
+		sparse_usagebuf_end = NULL;
+		return -ENOMEM;
+	}
+
+	sparse_usagebuf_end = sparse_usagebuf + size;
+	return 0;
 }
-#endif
 
-void __weak __meminit vmemmap_populate_print_last(void)
+static void __init sparse_usage_fini(void)
 {
+	sparse_usagebuf = sparse_usagebuf_end = NULL;
 }
 
-/**
- * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap
- * @map: usemap_map for pageblock flags or mmap_map for vmemmap
+/*
+ * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
+ * And number of present sections in this node is map_count.
  */
-static void __init alloc_usemap_and_memmap(void (*alloc_func)
-				(void *, unsigned long, unsigned long,
-				unsigned long, int), void *data)
+static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
+				   unsigned long pnum_end,
+				   unsigned long map_count)
 {
 	unsigned long pnum;
-	unsigned long map_count;
-	int nodeid_begin = 0;
-	unsigned long pnum_begin = 0;
-
-	for_each_present_section_nr(0, pnum) {
-		struct mem_section *ms;
+	struct page *map;
+	struct mem_section *ms;
 
-		ms = __nr_to_section(pnum);
-		nodeid_begin = sparse_early_nid(ms);
-		pnum_begin = pnum;
-		break;
+	if (sparse_usage_init(nid, map_count)) {
+		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
+		goto failed;
 	}
-	map_count = 1;
-	for_each_present_section_nr(pnum_begin + 1, pnum) {
-		struct mem_section *ms;
-		int nodeid;
+
+	sparse_buffer_init(map_count * section_map_size(), nid);
+
+	sparse_vmemmap_init_nid_early(nid);
+
+	for_each_present_section_nr(pnum_begin, pnum) {
+		unsigned long pfn = section_nr_to_pfn(pnum);
+
+		if (pnum >= pnum_end)
+			break;
 
 		ms = __nr_to_section(pnum);
-		nodeid = sparse_early_nid(ms);
-		if (nodeid == nodeid_begin) {
-			map_count++;
-			continue;
+		if (!preinited_vmemmap_section(ms)) {
+			map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+					nid, NULL, NULL);
+			if (!map) {
+				pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
+				       __func__, nid);
+				pnum_begin = pnum;
+				sparse_usage_fini();
+				sparse_buffer_fini();
+				goto failed;
+			}
+			memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
+							   PAGE_SIZE));
+			sparse_init_early_section(nid, map, pnum, 0);
 		}
-		/* ok, we need to take cake of from pnum_begin to pnum - 1*/
-		alloc_func(data, pnum_begin, pnum,
-						map_count, nodeid_begin);
-		/* new start, update count etc*/
-		nodeid_begin = nodeid;
-		pnum_begin = pnum;
-		map_count = 1;
 	}
-	/* ok, last chunk */
-	alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
-						map_count, nodeid_begin);
+
+	sparse_usage_fini();
+	sparse_buffer_fini();
+	return;
+failed:
+	/*
+	 * We failed to allocate, mark all the following pnums as not present,
+	 * except the ones already initialized earlier.
+	 */
+	for_each_present_section_nr(pnum_begin, pnum) {
+		if (pnum >= pnum_end)
+			break;
+		ms = __nr_to_section(pnum);
+		if (!preinited_vmemmap_section(ms))
+			ms->section_mem_map = 0;
+	}
 }
 
 /*
@@ -554,72 +593,35 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
  */
 void __init sparse_init(void)
 {
-	unsigned long pnum;
-	struct page *map;
-	unsigned long *usemap;
-	unsigned long **usemap_map;
-	int size;
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-	int size2;
-	struct page **map_map;
-#endif
+	unsigned long pnum_end, pnum_begin, map_count = 1;
+	int nid_begin;
 
 	/* see include/linux/mmzone.h 'struct mem_section' definition */
 	BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
 
+	memblocks_present();
+
+	pnum_begin = first_present_section_nr();
+	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
+
 	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
 	set_pageblock_order();
 
-	/*
-	 * map is using big page (aka 2M in x86 64 bit)
-	 * usemap is less one page (aka 24 bytes)
-	 * so alloc 2M (with 2M align) and 24 bytes in turn will
-	 * make next 2M slip to one more 2M later.
-	 * then in big system, the memory will have a lot of holes...
-	 * here try to allocate 2M pages continuously.
-	 *
-	 * powerpc need to call sparse_init_one_section right after each
-	 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
-	 */
-	size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
-	usemap_map = memblock_virt_alloc(size, 0);
-	if (!usemap_map)
-		panic("can not allocate usemap_map\n");
-	alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
-				(void *)usemap_map);
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-	size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
-	map_map = memblock_virt_alloc(size2, 0);
-	if (!map_map)
-		panic("can not allocate map_map\n");
-	alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
-				(void *)map_map);
-#endif
-
-	for_each_present_section_nr(0, pnum) {
-		usemap = usemap_map[pnum];
-		if (!usemap)
-			continue;
+	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
+		int nid = sparse_early_nid(__nr_to_section(pnum_end));
 
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-		map = map_map[pnum];
-#else
-		map = sparse_early_mem_map_alloc(pnum);
-#endif
-		if (!map)
+		if (nid == nid_begin) {
+			map_count++;
 			continue;
-
-		sparse_init_one_section(__nr_to_section(pnum), pnum, map,
-								usemap);
+		}
+		/* Init node with sections in range [pnum_begin, pnum_end) */
+		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
+		nid_begin = nid;
+		pnum_begin = pnum_end;
+		map_count = 1;
 	}
-
+	/* cover the last node */
+	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
 	vmemmap_populate_print_last();
-
-#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
-	memblock_free_early(__pa(map_map), size2);
-#endif
-	memblock_free_early(__pa(usemap_map), size);
 }
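The rewritten sparse_init() walks present sections once and flushes a batch to sparse_init_nid() each time the node id changes, plus once more for the final batch. The same grouping pattern over a plain array, with invented section-to-node data standing in for sparse_early_nid():

/* Grouping consecutive present sections by node, as sparse_init() does. */
#include <stdio.h>

static int section_nid[] = { 0, 0, 0, 1, 1, 2 };	/* assumed layout */
#define NR (sizeof(section_nid) / sizeof(section_nid[0]))

static void init_nid(int nid, unsigned long begin, unsigned long end,
		     unsigned long count)
{
	printf("node %d: sections [%lu, %lu), %lu map(s)\n",
	       nid, begin, end, count);	/* ~ sparse_init_nid() */
}

int main(void)
{
	unsigned long pnum_begin = 0, map_count = 1, pnum;
	int nid_begin = section_nid[0];

	for (pnum = pnum_begin + 1; pnum < NR; pnum++) {
		if (section_nid[pnum] == nid_begin) {
			map_count++;
			continue;
		}
		init_nid(nid_begin, pnum_begin, pnum, map_count);
		nid_begin = section_nid[pnum];
		pnum_begin = pnum;
		map_count = 1;
	}
	init_nid(nid_begin, pnum_begin, pnum, map_count);	/* last node */
	return 0;
}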
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -630,7 +632,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		unsigned long section_nr = pfn_to_section_nr(start_pfn);
+		unsigned long section_nr = pfn_to_section_nr(pfn);
 		struct mem_section *ms;
 
 		/* onlining code should never touch invalid ranges */
@@ -642,14 +644,13 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 	}
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
-/* Mark all memory sections within the pfn range as online */
+/* Mark all memory sections within the pfn range as offline */
 void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
-		unsigned long section_nr = pfn_to_section_nr(start_pfn);
+		unsigned long section_nr = pfn_to_section_nr(pfn);
 		struct mem_section *ms;
 
 		/*
@@ -663,83 +664,110 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
 		ms->section_mem_map &= ~SECTION_IS_ONLINE;
 	}
 }
-#endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+		struct dev_pagemap *pgmap)
 {
-	/* This will make the necessary allocations eventually. */
-	return sparse_mem_map_populate(pnum, nid);
+	return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
 }
-static void __kfree_section_memmap(struct page *memmap)
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
 {
-	unsigned long start = (unsigned long)memmap;
-	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+	unsigned long start = (unsigned long) pfn_to_page(pfn);
+	unsigned long end = start + nr_pages * sizeof(struct page);
 
-	vmemmap_free(start, end);
+	vmemmap_free(start, end, altmap);
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long start = (unsigned long)memmap;
 	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
 
-	vmemmap_free(start, end);
+	vmemmap_free(start, end, NULL);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-#else
-static struct page *__kmalloc_section_memmap(void)
-{
-	struct page *page, *ret;
-	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
 
-	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
-	if (page)
-		goto got_map_page;
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+	struct mem_section *ms = __pfn_to_section(pfn);
+	unsigned long *subsection_map = ms->usage
+		? &ms->usage->subsection_map[0] : NULL;
 
-	ret = vmalloc(memmap_size);
-	if (ret)
-		goto got_map_ptr;
+	subsection_mask_set(map, pfn, nr_pages);
+	if (subsection_map)
+		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
 
-	return NULL;
-got_map_page:
-	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
+	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+			"section already deactivated (%#lx + %ld)\n",
			pfn, nr_pages))
+		return -EINVAL;
 
-	return ret;
+	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+	return 0;
 }
 
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static bool is_subsection_map_empty(struct mem_section *ms)
 {
-	return __kmalloc_section_memmap();
+	return bitmap_empty(&ms->usage->subsection_map[0],
+			    SUBSECTIONS_PER_SECTION);
 }
 
-static void __kfree_section_memmap(struct page *memmap)
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
 {
-	if (is_vmalloc_addr(memmap))
-		vfree(memmap);
+	struct mem_section *ms = __pfn_to_section(pfn);
+	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+	unsigned long *subsection_map;
+	int rc = 0;
+
+	subsection_mask_set(map, pfn, nr_pages);
+
+	subsection_map = &ms->usage->subsection_map[0];
+
+	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+		rc = -EINVAL;
+	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+		rc = -EEXIST;
 	else
-		free_pages((unsigned long)memmap,
-			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
+		bitmap_or(subsection_map, map, subsection_map,
+				SUBSECTIONS_PER_SECTION);
+
+	return rc;
+}
+#else
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
+		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+		struct dev_pagemap *pgmap)
+{
+	return kvmalloc_node(array_size(sizeof(struct page),
+					PAGES_PER_SECTION), GFP_KERNEL, nid);
+}
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
+{
+	kvfree(pfn_to_page(pfn));
 }
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 static void free_map_bootmem(struct page *memmap)
 {
 	unsigned long maps_section_nr, removing_section_nr, i;
-	unsigned long magic, nr_pages;
+	unsigned long type, nr_pages;
 	struct page *page = virt_to_page(memmap);
 
 	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
 		>> PAGE_SHIFT;
 
 	for (i = 0; i < nr_pages; i++, page++) {
-		magic = (unsigned long) page->freelist;
+		type = bootmem_type(page);
 
-		BUG_ON(magic == NODE_INFO);
+		BUG_ON(type == NODE_INFO);
 
 		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
-		removing_section_nr = page_private(page);
+		removing_section_nr = bootmem_info(page);
 
 		/*
 		 * When this function is called, the removing section is
@@ -753,131 +781,198 @@ static void free_map_bootmem(struct page *memmap)
 			put_page_bootmem(page);
 	}
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+	return 0;
+}
+
+static bool is_subsection_map_empty(struct mem_section *ms)
+{
+	return true;
+}
+
+static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
+{
+	return 0;
+}
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
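fill_subsection_map() and clear_subsection_map() are set algebra on the subsection bitmap: fill rejects any overlap with already-present subsections (-EEXIST), clear rejects ranges whose bits are not all currently set ("already deactivated"), and the xor drops exactly the cleared range. A single-word sketch with hard-coded errno values:

/* Fill/clear semantics on a one-word subsection map; sizes assumed. */
#include <stdio.h>

static unsigned long subsection_map;

static int fill(unsigned long mask)
{
	if (!mask)
		return -22;		/* -EINVAL: empty request */
	if (mask & subsection_map)
		return -17;		/* -EEXIST: already populated */
	subsection_map |= mask;	/* ~ bitmap_or() */
	return 0;
}

static int clear(unsigned long mask)
{
	if ((mask & subsection_map) != mask)
		return -22;		/* -EINVAL: already deactivated */
	subsection_map ^= mask;	/* ~ bitmap_xor() drops the range */
	return 0;
}

int main(void)
{
	printf("fill 0x0f  -> %d\n", fill(0x0f));	/* 0 */
	printf("fill 0x18  -> %d\n", fill(0x18));	/* -17, bit 3 overlaps */
	printf("clear 0x03 -> %d\n", clear(0x03));	/* 0, map now 0x0c */
	printf("clear 0x03 -> %d\n", clear(0x03));	/* -22, double remove */
	printf("map = %#lx\n", subsection_map);
	return 0;
}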
 
 /*
- * returns the number of sections whose mem_maps were properly
- * set.  If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+ * To deactivate a memory region, there are 3 cases to handle across
+ * two configurations (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1. deactivation of a partial hot-added section (only possible in
+ *    the SPARSEMEM_VMEMMAP=y case).
+ *      a) section was present at memory init.
+ *      b) section was hot-added post memory init.
+ * 2. deactivation of a complete hot-added section.
+ * 3. deactivation of a complete section from memory init.
+ *
+ * For 1, when subsection_map does not empty we will not be freeing the
+ * usage map, but still need to free the vmemmap range.
+ *
+ * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
  */
-int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
 {
-	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct mem_section *ms;
-	struct page *memmap;
-	unsigned long *usemap;
-	unsigned long flags;
-	int ret;
+	struct mem_section *ms = __pfn_to_section(pfn);
+	bool section_is_early = early_section(ms);
+	struct page *memmap = NULL;
+	bool empty;
 
-	/*
-	 * no locking for this, because it does its own
-	 * plus, it does a kmalloc
-	 */
-	ret = sparse_index_init(section_nr, pgdat->node_id);
-	if (ret < 0 && ret != -EEXIST)
-		return ret;
-	memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
-	if (!memmap)
-		return -ENOMEM;
-	usemap = __kmalloc_section_usemap();
-	if (!usemap) {
-		__kfree_section_memmap(memmap);
-		return -ENOMEM;
-	}
+	if (clear_subsection_map(pfn, nr_pages))
+		return;
 
-	pgdat_resize_lock(pgdat, &flags);
+	empty = is_subsection_map_empty(ms);
+	if (empty) {
+		unsigned long section_nr = pfn_to_section_nr(pfn);
 
-	ms = __pfn_to_section(start_pfn);
-	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
-		ret = -EEXIST;
-		goto out;
+		/*
+		 * Mark the section invalid so that valid_section()
+		 * return false. This prevents code from dereferencing
+		 * ms->usage array.
+		 */
+		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
 
+		/*
+		 * When removing an early section, the usage map is kept (as the
+		 * usage maps of other sections fall into the same page). It
+		 * will be re-used when re-adding the section - which is then no
+		 * longer an early section. If the usage map is PageReserved, it
+		 * was allocated during boot.
+		 */
+		if (!PageReserved(virt_to_page(ms->usage))) {
+			kfree_rcu(ms->usage, rcu);
+			WRITE_ONCE(ms->usage, NULL);
+		}
+		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+	}
 
-	memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+	/*
+	 * The memmap of early sections is always fully populated. See
+	 * section_activate() and pfn_valid() .
+	 */
+	if (!section_is_early) {
+		memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
+		depopulate_section_memmap(pfn, nr_pages, altmap);
+	} else if (memmap) {
+		memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
+							  PAGE_SIZE)));
+		free_map_bootmem(memmap);
+	}
 
-	section_mark_present(ms);
+	if (empty)
+		ms->section_mem_map = (unsigned long)NULL;
+}
 
-	ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap,
+		struct dev_pagemap *pgmap)
+{
+	struct mem_section *ms = __pfn_to_section(pfn);
+	struct mem_section_usage *usage = NULL;
+	struct page *memmap;
+	int rc;
 
-out:
-	pgdat_resize_unlock(pgdat, &flags);
-	if (ret <= 0) {
-		kfree(usemap);
-		__kfree_section_memmap(memmap);
+	if (!ms->usage) {
+		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+		if (!usage)
+			return ERR_PTR(-ENOMEM);
+		ms->usage = usage;
 	}
-	return ret;
-}
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
-#ifdef CONFIG_MEMORY_FAILURE
-static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-	int i;
+	rc = fill_subsection_map(pfn, nr_pages);
+	if (rc) {
+		if (usage)
+			ms->usage = NULL;
+		kfree(usage);
+		return ERR_PTR(rc);
+	}
 
-	if (!memmap)
-		return;
+	/*
+	 * The early init code does not consider partially populated
+	 * initial sections, it simply assumes that memory will never be
+	 * referenced.  If we hot-add memory into such a section then we
+	 * do not need to populate the memmap and can simply reuse what
+	 * is already there.
+	 */
+	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+		return pfn_to_page(pfn);
 
-	for (i = 0; i < nr_pages; i++) {
-		if (PageHWPoison(&memmap[i])) {
-			atomic_long_sub(1, &num_poisoned_pages);
-			ClearPageHWPoison(&memmap[i]);
-		}
+	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
+	if (!memmap) {
+		section_deactivate(pfn, nr_pages, altmap);
+		return ERR_PTR(-ENOMEM);
 	}
+	memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
+
+	return memmap;
 }
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
 
-static void free_section_usemap(struct page *memmap, unsigned long *usemap)
+/**
+ * sparse_add_section - add a memory section, or populate an existing one
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
+ * @altmap: alternate pfns to allocate the memmap backing store
+ * @pgmap: alternate compound page geometry for devmap mappings
+ *
+ * This is only intended for hotplug.
+ *
+ * Note that only VMEMMAP supports sub-section aligned hotplug,
+ * the proper alignment and size are gated by check_pfn_span().
+ *
+ *
+ * Return:
+ * * 0		- On success.
+ * * -EEXIST	- Section has been present.
+ * * -ENOMEM	- Out of memory.
+ */
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+		unsigned long nr_pages, struct vmem_altmap *altmap,
+		struct dev_pagemap *pgmap)
 {
-	struct page *usemap_page;
+	unsigned long section_nr = pfn_to_section_nr(start_pfn);
+	struct mem_section *ms;
+	struct page *memmap;
+	int ret;
 
-	if (!usemap)
-		return;
+	ret = sparse_index_init(section_nr, nid);
+	if (ret < 0)
+		return ret;
 
-	usemap_page = virt_to_page(usemap);
-	/*
-	 * Check to see if allocation came from hot-plug-add
-	 */
-	if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
-		kfree(usemap);
-		if (memmap)
-			__kfree_section_memmap(memmap);
-		return;
-	}
+	memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
+	if (IS_ERR(memmap))
+		return PTR_ERR(memmap);
 
 	/*
-	 * The usemap came from bootmem. This is packed with other usemaps
-	 * on the section which has pgdat at boot time. Just keep it as is now.
+	 * Poison uninitialized struct pages in order to catch invalid flags
+	 * combinations.
 	 */
+	page_init_poison(memmap, sizeof(struct page) * nr_pages);
 
-	if (memmap)
-		free_map_bootmem(memmap);
+	ms = __nr_to_section(section_nr);
+	set_section_nid(section_nr, nid);
+	__section_mark_present(ms, section_nr);
+
+	/* Align memmap to section boundary in the subsection case */
+	if (section_nr_to_pfn(section_nr) != start_pfn)
+		memmap = pfn_to_page(section_nr_to_pfn(section_nr));
+	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+	return 0;
 }
 
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
-		unsigned long map_offset)
+void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
+		struct vmem_altmap *altmap)
 {
-	struct page *memmap = NULL;
-	unsigned long *usemap = NULL, flags;
-	struct pglist_data *pgdat = zone->zone_pgdat;
-
-	pgdat_resize_lock(pgdat, &flags);
-	if (ms->section_mem_map) {
-		usemap = ms->pageblock_flags;
-		memmap = sparse_decode_mem_map(ms->section_mem_map,
-						__section_nr(ms));
-		ms->section_mem_map = 0;
-		ms->pageblock_flags = NULL;
-	}
-	pgdat_resize_unlock(pgdat, &flags);
+	struct mem_section *ms = __pfn_to_section(pfn);
+
+	if (WARN_ON_ONCE(!valid_section(ms)))
+		return;
 
-	clear_hwpoisoned_pages(memmap + map_offset,
-			PAGES_PER_SECTION - map_offset);
-	free_section_usemap(memmap, usemap);
+	section_deactivate(pfn, nr_pages, altmap);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_MEMORY_HOTPLUG */
