diff options
Diffstat (limited to 'mm/memblock.c')
| -rw-r--r-- | mm/memblock.c | 783 |
1 files changed, 656 insertions, 127 deletions
diff --git a/mm/memblock.c b/mm/memblock.c index f9e61e565a53..905d06b16348 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -16,6 +16,12 @@ #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> +#include <linux/mutex.h> + +#ifdef CONFIG_KEXEC_HANDOVER +#include <linux/libfdt.h> +#include <linux/kexec_handover.h> +#endif /* CONFIG_KEXEC_HANDOVER */ #include <asm/sections.h> #include <linux/io.h> @@ -106,6 +112,13 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +/* When set to true, only allocate from MEMBLOCK_KHO_SCRATCH ranges */ +static bool kho_scratch_only; +#else +#define kho_scratch_only false +#endif + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -114,12 +127,10 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, - .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, - .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", @@ -130,7 +141,6 @@ struct memblock memblock __initdata_memblock = { #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP struct memblock_type physmem = { .regions = memblock_physmem_init_regions, - .cnt = 1, /* empty dummy entry */ .max = INIT_PHYSMEM_REGIONS, .name = "physmem", }; @@ -161,8 +171,17 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock; static int memblock_reserved_in_slab __initdata_memblock; +bool __init_memblock memblock_has_mirror(void) +{ + return 
system_has_some_mirror; +} + static enum memblock_flags __init_memblock choose_memblock_flags(void) { + /* skip non-scratch memory for kho early boot allocations */ + if (kho_scratch_only) + return MEMBLOCK_KHO_SCRATCH; + return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -175,8 +194,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) /* * Address comparison utilities */ -static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) +unsigned long __init_memblock +memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, + phys_addr_t size2) { return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } @@ -191,8 +211,8 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, for (i = 0; i < type->cnt; i++) if (memblock_addrs_overlap(base, size, type->regions[i].base, type->regions[i].size)) - break; - return i < type->cnt; + return true; + return false; } /** @@ -350,7 +370,6 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u /* Special case for empty arrays */ if (type->cnt == 0) { WARN_ON(type->total_size != 0); - type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; type->regions[0].flags = 0; @@ -419,7 +438,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * of memory that aren't suitable for allocation */ if (!memblock_can_resize) - return -1; + panic("memblock: cannot resize %s array\n", type->name); /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); @@ -454,7 +473,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, min(new_area_start, memblock.current_limit), new_alloc_size, PAGE_SIZE); - new_array = addr ? __va(addr) : NULL; + if (addr) { + /* The memory may not have been accepted, yet. 
*/ + accept_memory(addr, new_alloc_size); + + new_array = __va(addr); + } else { + new_array = NULL; + } } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", @@ -489,7 +515,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, * needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_alloc_size)); + BUG_ON(memblock_reserve_kern(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; @@ -594,12 +620,13 @@ static int __init_memblock memblock_add_range(struct memblock_type *type, /* special case for empty array */ if (type->regions[0].size == 0) { - WARN_ON(type->cnt != 1 || type->total_size); + WARN_ON(type->cnt != 0 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; + type->cnt = 1; return 0; } @@ -638,7 +665,7 @@ repeat: #ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif - WARN_ON(flags != rgn->flags); + WARN_ON(flags != MEMBLOCK_NONE && flags != rgn->flags); nr_new++; if (insert) { if (start_rgn == -1) @@ -730,6 +757,40 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) } /** + * memblock_validate_numa_coverage - check if amount of memory with + * no node ID assigned is less than a threshold + * @threshold_bytes: maximal memory size that can have unassigned node + * ID (in bytes). + * + * A buggy firmware may report memory that does not belong to any node. + * Check if amount of such memory is below @threshold_bytes. + * + * Return: true on success, false on failure. 
+ */ +bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes) +{ + unsigned long nr_pages = 0; + unsigned long start_pfn, end_pfn, mem_size_mb; + int nid, i; + + /* calculate lose page */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (!numa_valid_node(nid)) + nr_pages += end_pfn - start_pfn; + } + + if ((nr_pages << PAGE_SHIFT) > threshold_bytes) { + mem_size_mb = memblock_phys_mem_size() / SZ_1M; + pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n", + (nr_pages << PAGE_SHIFT) / SZ_1M, mem_size_mb); + return false; + } + + return true; +} + + +/** * memblock_isolate_range - isolate given range into disjoint memblocks * @type: memblock type to isolate range for * @base: base of range to isolate @@ -740,7 +801,8 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) * Walk @type and ensure that regions don't cross the boundaries defined by * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first - * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * region inside the range is returned in *@start_rgn and the index of the + * first region after the range is returned in *@end_rgn. * * Return: * 0 on success, -errno on failure. 
@@ -863,14 +925,15 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock __memblock_reserve(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) { phys_addr_t end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, - &base, &end, (void *)_RET_IP_); + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); - return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP @@ -885,8 +948,43 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) } #endif +#ifdef CONFIG_MEMBLOCK_KHO_SCRATCH +__init void memblock_set_kho_scratch_only(void) +{ + kho_scratch_only = true; +} + +__init void memblock_clear_kho_scratch_only(void) +{ + kho_scratch_only = false; +} + +__init void memmap_init_kho_scratch_pages(void) +{ + phys_addr_t start, end; + unsigned long pfn; + int nid; + u64 i; + + if (!IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) + return; + + /* + * Initialize struct pages for free scratch memory. + * The struct pages for reserved scratch memory will be set up in + * reserve_bootmem_region() + */ + __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, + MEMBLOCK_KHO_SCRATCH, &start, &end, &nid) { + for (pfn = PFN_UP(start); pfn < PFN_DOWN(end); pfn++) + init_deferred_page(pfn, nid); + } +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region + * @type: memblock type to set/clear flag for * @base: base address of the region * @size: size of the region * @set: set or clear the flag @@ -896,10 +994,9 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) * * Return: 0 on success, -errno on failure. 
*/ -static int __init_memblock memblock_setclr_flag(phys_addr_t base, - phys_addr_t size, int set, int flag) +static int __init_memblock memblock_setclr_flag(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, int set, int flag) { - struct memblock_type *type = &memblock.memory; int i, ret, start_rgn, end_rgn; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); @@ -928,7 +1025,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG); } /** @@ -940,7 +1037,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG); } /** @@ -957,7 +1054,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) system_has_some_mirror = true; - return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR); } /** @@ -977,7 +1074,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP); } /** @@ -989,7 +1086,62 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); +} + +/** + * 
memblock_reserved_mark_noinit - Mark a reserved memory region with flag + * MEMBLOCK_RSRV_NOINIT + * + * @base: the base phys addr of the region + * @size: the size of the region + * + * The struct pages for the reserved regions marked %MEMBLOCK_RSRV_NOINIT will + * not be fully initialized to allow the caller optimize their initialization. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, setting this flag + * completely bypasses the initialization of struct pages for such region. + * + * When %CONFIG_DEFERRED_STRUCT_PAGE_INIT is disabled, struct pages in this + * region will be initialized with default values but won't be marked as + * reserved. + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.reserved, base, size, 1, + MEMBLOCK_RSRV_NOINIT); +} + +/** + * memblock_mark_kho_scratch - Mark a memory region as MEMBLOCK_KHO_SCRATCH. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Only memory regions marked with %MEMBLOCK_KHO_SCRATCH will be considered + * for allocations during early boot with kexec handover. + * + * Return: 0 on success, -errno on failure. + */ +__init int memblock_mark_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 1, + MEMBLOCK_KHO_SCRATCH); +} + +/** + * memblock_clear_kho_scratch - Clear MEMBLOCK_KHO_SCRATCH flag for a + * specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. 
+ */ +__init int memblock_clear_kho_scratch(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.memory, base, size, 0, + MEMBLOCK_KHO_SCRATCH); } static bool should_skip_region(struct memblock_type *type, @@ -1003,7 +1155,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1023,6 +1175,13 @@ static bool should_skip_region(struct memblock_type *type, if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) return true; + /* + * In early alloc during kexec handover, we can only consider + * MEMBLOCK_KHO_SCRATCH regions for the allocations + */ + if ((flags & MEMBLOCK_KHO_SCRATCH) && !memblock_is_kho_scratch(m)) + return true; + return false; } @@ -1060,10 +1219,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1157,9 +1312,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1245,7 +1397,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1293,70 +1445,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -/** - * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() - * - * @idx: pointer to u64 loop variable - * @zone: zone in which all of the memory blocks reside - * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL - * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL - * - * This function is meant to be a zone/pfn specific wrapper for the - * for_each_mem_range type iterators. Specifically they are used in the - * deferred memory init routines and as such we were duplicating much of - * this logic throughout the code. So instead of having it in multiple - * locations it seemed like it would make more sense to centralize this to - * one new iterator that does everything they need. - */ -void __init_memblock -__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, - unsigned long *out_spfn, unsigned long *out_epfn) -{ - int zone_nid = zone_to_nid(zone); - phys_addr_t spa, epa; - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - - while (*idx != U64_MAX) { - unsigned long epfn = PFN_DOWN(epa); - unsigned long spfn = PFN_UP(spa); - - /* - * Verify the end is at least past the start of the zone and - * that we have at least one PFN to initialize. 
- */ - if (zone->zone_start_pfn < epfn && spfn < epfn) { - /* if we went too far just stop searching */ - if (zone_end_pfn(zone) <= spfn) { - *idx = U64_MAX; - break; - } - - if (out_spfn) - *out_spfn = max(zone->zone_start_pfn, spfn); - if (out_epfn) - *out_epfn = min(zone_end_pfn(zone), epfn); - - return; - } - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - } - - /* signal end of iteration */ - if (out_spfn) - *out_spfn = ULONG_MAX; - if (out_epfn) - *out_epfn = 0; -} - -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ - /** * memblock_alloc_range_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes @@ -1390,8 +1478,16 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of memblock_free_all) + */ + if (WARN_ON_ONCE(slab_is_available())) { + void *vaddr = kzalloc_node(size, GFP_NOWAIT, nid); + + return vaddr ? 
virt_to_phys(vaddr) : 0; + } if (!align) { /* Can't use WARNs this early in boot on powerpc */ @@ -1402,14 +1498,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) + if (found && !__memblock_reserve(found, size, nid, MEMBLOCK_RSRV_KERN)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); - if (found && !memblock_reserve(found, size)) + if (found && !memblock_reserve_kern(found, size)) goto done; } @@ -1443,7 +1539,7 @@ done: * * Accept the memory of the allocated buffer. */ - accept_memory(found, found + size); + accept_memory(found, size); return found; } @@ -1518,13 +1614,6 @@ static void * __init memblock_alloc_internal( { phys_addr_t alloc; - /* - * Detect any accidental use of these APIs after slab is ready, as at - * this moment memblock may be deinitialized already and its - * internal data may be destroyed (after execution of memblock_free_all) - */ - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc_node(size, GFP_NOWAIT, nid); if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; @@ -1642,6 +1731,26 @@ void * __init memblock_alloc_try_nid( } /** + * __memblock_alloc_or_panic - Try to allocate memory and panic on failure + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @func: caller func name + * + * This function attempts to allocate memory using memblock_alloc, + * and in case of failure, it calls panic with the formatted message. + * This function should not be used directly, please use the macro memblock_alloc_or_panic. 
+ */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + +/** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes @@ -1681,6 +1790,46 @@ phys_addr_t __init_memblock memblock_reserved_size(void) return memblock.reserved.total_size; } +phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int nid) +{ + struct memblock_region *r; + phys_addr_t total = 0; + + for_each_reserved_mem_region(r) { + phys_addr_t size = r->size; + + if (r->base > limit) + break; + + if (r->base + r->size > limit) + size = limit - r->base; + + if (nid == memblock_get_region_node(r) || !numa_valid_node(nid)) + if (r->flags & MEMBLOCK_RSRV_KERN) + total += size; + } + + return total; +} + +/** + * memblock_estimated_nr_free_pages - return estimated number of free pages + * from memblock point of view + * + * During bootup, subsystems might need a rough estimate of the number of free + * pages in the whole system, before precise numbers are available from the + * buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers + * obtained from the buddy might be very imprecise during bootup. + * + * Return: + * An estimated number of free pages from memblock point of view. 
+ */ +unsigned long __init memblock_estimated_nr_free_pages(void) +{ + return PHYS_PFN(memblock_phys_mem_size() - + memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE)); +} + /* lowest address */ phys_addr_t __init_memblock memblock_start_of_DRAM(void) { @@ -1828,7 +1977,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) - return -1; + return NUMA_NO_NODE; *start_pfn = PFN_DOWN(type->regions[mid].base); *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); @@ -1925,7 +2074,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -1983,7 +2132,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) * downwards. */ pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; + pgend = PAGE_ALIGN_DOWN(__pa(end_pg)); /* * If there are free pages between these, free the section of the @@ -2056,12 +2205,13 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) * Free the pages in the largest chunks alignment allows. * * __ffs() behaviour is undefined for 0. start == 0 is - * MAX_ORDER-aligned, set order to MAX_ORDER for the case. + * MAX_PAGE_ORDER-aligned, set order to MAX_PAGE_ORDER for + * the case. 
*/ if (start) - order = min_t(int, MAX_ORDER, __ffs(start)); + order = min_t(int, MAX_PAGE_ORDER, __ffs(start)); else - order = MAX_ORDER; + order = MAX_PAGE_ORDER; while (start + (1UL << order) > end) order--; @@ -2076,8 +2226,10 @@ static unsigned long __init __free_memory_core(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); + unsigned long end_pfn = PFN_DOWN(end); + + if (!IS_ENABLED(CONFIG_HIGHMEM) && end_pfn > max_low_pfn) + end_pfn = max_low_pfn; if (start_pfn >= end_pfn) return 0; @@ -2092,11 +2244,14 @@ static void __init memmap_init_reserved_pages(void) struct memblock_region *region; phys_addr_t start, end; int nid; + unsigned long max_reserved; /* * set nid on all reserved pages and also treat struct * pages for the NOMAP regions as PageReserved */ +repeat: + max_reserved = memblock.reserved.max; for_each_mem_region(region) { nid = memblock_get_region_node(region); start = region->base; @@ -2105,16 +2260,31 @@ static void __init memmap_init_reserved_pages(void) if (memblock_is_nomap(region)) reserve_bootmem_region(start, end, nid); - memblock_set_node(start, end, &memblock.reserved, nid); + memblock_set_node(start, region->size, &memblock.reserved, nid); } + /* + * 'max' is changed means memblock.reserved has been doubled its + * array, which may result a new reserved region before current + * 'start'. Now we should repeat the procedure to set its node id. 
+ */ + if (max_reserved != memblock.reserved.max) + goto repeat; - /* initialize struct pages for the reserved regions */ + /* + * initialize struct pages for reserved regions that don't have + * the MEMBLOCK_RSRV_NOINIT flag set + */ for_each_reserved_mem_region(region) { - nid = memblock_get_region_node(region); - start = region->base; - end = start + region->size; + if (!memblock_is_reserved_noinit(region)) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; + + if (!numa_valid_node(nid)) + nid = early_pfn_to_nid(PFN_DOWN(start)); - reserve_bootmem_region(start, end, nid); + reserve_bootmem_region(start, end, nid); + } } } @@ -2173,16 +2343,375 @@ void __init memblock_free_all(void) free_unused_memmap(); reset_all_zones_managed_pages(); + memblock_clear_kho_scratch_only(); pages = free_low_memory_core_early(); totalram_pages_add(pages); } +/* Keep a table to reserve named memory */ +#define RESERVE_MEM_MAX_ENTRIES 8 +#define RESERVE_MEM_NAME_SIZE 16 +struct reserve_mem_table { + char name[RESERVE_MEM_NAME_SIZE]; + phys_addr_t start; + phys_addr_t size; +}; +static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; +static int reserved_mem_count; +static DEFINE_MUTEX(reserve_mem_lock); + +/* Add wildcard region with a lookup name */ +static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, + const char *name) +{ + struct reserve_mem_table *map; + + map = &reserved_mem_table[reserved_mem_count++]; + map->start = start; + map->size = size; + strscpy(map->name, name); +} + +static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) + return map; + } + return NULL; +} + +/** + * reserve_mem_find_by_name - Find reserved memory region with a given name + * @name: The name that is 
attached to a reserved memory region + * @start: If found, holds the start address + * @size: If found, holds the size of the address. + * + * @start and @size are only updated if @name is found. + * + * Returns: 1 if found or 0 if not found. + */ +int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) +{ + struct reserve_mem_table *map; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + *start = map->start; + *size = map->size; + return 1; +} +EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); + +/** + * reserve_mem_release_by_name - Release reserved memory region with a given name + * @name: The name that is attatched to a reserved memory region + * + * Forcibly release the pages in the reserved memory region so that those memory + * can be used as free memory. After released the reserved region size becomes 0. + * + * Returns: 1 if released or 0 if not found. + */ +int reserve_mem_release_by_name(const char *name) +{ + char buf[RESERVE_MEM_NAME_SIZE + 12]; + struct reserve_mem_table *map; + void *start, *end; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + start = phys_to_virt(map->start); + end = start + map->size - 1; + snprintf(buf, sizeof(buf), "reserve_mem:%s", name); + free_reserved_area(start, end, 0, buf); + map->size = 0; + + return 1; +} + +#ifdef CONFIG_KEXEC_HANDOVER +#define MEMBLOCK_KHO_FDT "memblock" +#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" + +static int __init reserved_mem_preserve(void) +{ + unsigned int nr_preserved = 0; + int err; + + for (unsigned int i = 0; i < reserved_mem_count; i++, nr_preserved++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; + + err = kho_preserve_pages(page, nr_pages); + if (err) + goto 
err_unpreserve; + } + + return 0; + +err_unpreserve: + for (unsigned int i = 0; i < nr_preserved; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; + + kho_unpreserve_pages(page, nr_pages); + } + + return err; +} + +static int __init prepare_kho_fdt(void) +{ + struct page *fdt_page; + void *fdt; + int err; + + fdt_page = alloc_page(GFP_KERNEL); + if (!fdt_page) { + err = -ENOMEM; + goto err_report; + } + + fdt = page_to_virt(fdt_page); + err = kho_preserve_pages(fdt_page, 1); + if (err) + goto err_free_fdt; + + err |= fdt_create(fdt, PAGE_SIZE); + err |= fdt_finish_reservemap(fdt); + err |= fdt_begin_node(fdt, ""); + err |= fdt_property_string(fdt, "compatible", MEMBLOCK_KHO_NODE_COMPATIBLE); + + for (unsigned int i = 0; !err && i < reserved_mem_count; i++) { + struct reserve_mem_table *map = &reserved_mem_table[i]; + + err |= fdt_begin_node(fdt, map->name); + err |= fdt_property_string(fdt, "compatible", RESERVE_MEM_KHO_NODE_COMPATIBLE); + err |= fdt_property(fdt, "start", &map->start, sizeof(map->start)); + err |= fdt_property(fdt, "size", &map->size, sizeof(map->size)); + err |= fdt_end_node(fdt); + } + err |= fdt_end_node(fdt); + err |= fdt_finish(fdt); + + if (err) + goto err_unpreserve_fdt; + + err = kho_add_subtree(MEMBLOCK_KHO_FDT, fdt); + if (err) + goto err_unpreserve_fdt; + + err = reserved_mem_preserve(); + if (err) + goto err_remove_subtree; + + return 0; + +err_remove_subtree: + kho_remove_subtree(fdt); +err_unpreserve_fdt: + kho_unpreserve_pages(fdt_page, 1); +err_free_fdt: + put_page(fdt_page); +err_report: + pr_err("failed to prepare memblock FDT for KHO: %d\n", err); + + return err; +} + +static int __init reserve_mem_init(void) +{ + int err; + + if (!kho_is_enabled() || !reserved_mem_count) + return 0; + + err = prepare_kho_fdt(); + if (err) + return err; + return err; +} +late_initcall(reserve_mem_init); + +static void *__init 
reserve_mem_kho_retrieve_fdt(void) +{ + phys_addr_t fdt_phys; + static void *fdt; + int err; + + if (fdt) + return fdt; + + err = kho_retrieve_subtree(MEMBLOCK_KHO_FDT, &fdt_phys); + if (err) { + if (err != -ENOENT) + pr_warn("failed to retrieve FDT '%s' from KHO: %d\n", + MEMBLOCK_KHO_FDT, err); + return NULL; + } + + fdt = phys_to_virt(fdt_phys); + + err = fdt_node_check_compatible(fdt, 0, MEMBLOCK_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("FDT '%s' is incompatible with '%s': %d\n", + MEMBLOCK_KHO_FDT, MEMBLOCK_KHO_NODE_COMPATIBLE, err); + fdt = NULL; + } + + return fdt; +} + +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size, + phys_addr_t align) +{ + int err, len_start, len_size, offset; + const phys_addr_t *p_start, *p_size; + const void *fdt; + + fdt = reserve_mem_kho_retrieve_fdt(); + if (!fdt) + return false; + + offset = fdt_subnode_offset(fdt, 0, name); + if (offset < 0) { + pr_warn("FDT '%s' has no child '%s': %d\n", + MEMBLOCK_KHO_FDT, name, offset); + return false; + } + err = fdt_node_check_compatible(fdt, offset, RESERVE_MEM_KHO_NODE_COMPATIBLE); + if (err) { + pr_warn("Node '%s' is incompatible with '%s': %d\n", + name, RESERVE_MEM_KHO_NODE_COMPATIBLE, err); + return false; + } + + p_start = fdt_getprop(fdt, offset, "start", &len_start); + p_size = fdt_getprop(fdt, offset, "size", &len_size); + if (!p_start || len_start != sizeof(*p_start) || !p_size || + len_size != sizeof(*p_size)) { + return false; + } + + if (*p_start & (align - 1)) { + pr_warn("KHO reserve-mem '%s' has wrong alignment (0x%lx, 0x%lx)\n", + name, (long)align, (long)*p_start); + return false; + } + + if (*p_size != size) { + pr_warn("KHO reserve-mem '%s' has wrong size (0x%lx != 0x%lx)\n", + name, (long)*p_size, (long)size); + return false; + } + + reserved_mem_add(*p_start, size, name); + pr_info("Revived memory reservation '%s' from KHO\n", name); + + return true; +} +#else +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t 
size, + phys_addr_t align) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER */ + +/* + * Parse reserve_mem=nn:align:name + */ +static int __init reserve_mem(char *p) +{ + phys_addr_t start, size, align, tmp; + char *name; + char *oldp; + int len; + + if (!p) + return -EINVAL; + + /* Check if there's room for more reserved memory */ + if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES) + return -EBUSY; + + oldp = p; + size = memparse(p, &p); + if (!size || p == oldp) + return -EINVAL; + + if (*p != ':') + return -EINVAL; + + align = memparse(p+1, &p); + if (*p != ':') + return -EINVAL; + + /* + * memblock_phys_alloc() doesn't like a zero size align, + * but it is OK for this command to have it. + */ + if (align < SMP_CACHE_BYTES) + align = SMP_CACHE_BYTES; + + name = p + 1; + len = strlen(name); + + /* name needs to have length but not too big */ + if (!len || len >= RESERVE_MEM_NAME_SIZE) + return -EINVAL; + + /* Make sure that name has text */ + for (p = name; *p; p++) { + if (!isspace(*p)) + break; + } + if (!*p) + return -EINVAL; + + /* Make sure the name is not already used */ + if (reserve_mem_find_by_name(name, &start, &tmp)) + return -EBUSY; + + /* Pick previous allocations up from KHO if available */ + if (reserve_mem_kho_revive(name, size, align)) + return 1; + + /* TODO: Allocation must be outside of scratch region */ + start = memblock_phys_alloc(size, align); + if (!start) + return -ENOMEM; + + reserved_mem_add(start, size, name); + + return 1; +} +__setup("reserve_mem=", reserve_mem); + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static const char * const flagname[] = { [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG", [ilog2(MEMBLOCK_MIRROR)] = "MIRROR", [ilog2(MEMBLOCK_NOMAP)] = "NOMAP", [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG", + [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT", + [ilog2(MEMBLOCK_RSRV_KERN)] = "RSV_KERN", + [ilog2(MEMBLOCK_KHO_SCRATCH)] = "KHO_SCRATCH", }; static int memblock_debug_show(struct seq_file *m, void 
*private) @@ -2200,7 +2729,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", &reg->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); |
