diff options
Diffstat (limited to 'kernel/resource.c')
-rw-r--r-- | kernel/resource.c | 927 |
1 files changed, 664 insertions, 263 deletions
diff --git a/kernel/resource.c b/kernel/resource.c index 841737bbda9e..12004452d999 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -18,12 +18,17 @@ #include <linux/spinlock.h> #include <linux/fs.h> #include <linux/proc_fs.h> +#include <linux/pseudo_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> #include <linux/mm.h> +#include <linux/mount.h> #include <linux/resource_ext.h> +#include <uapi/linux/magic.h> +#include <linux/string.h> +#include <linux/vmalloc.h> #include <asm/io.h> @@ -43,43 +48,37 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); -/* constraints to be met while allocating resources */ -struct resource_constraint { - resource_size_t min, max, align; - resource_size_t (*alignf)(void *, const struct resource *, - resource_size_t, resource_size_t); - void *alignf_data; -}; - static DEFINE_RWLOCK(resource_lock); /* - * For memory hotplug, there is no way to free resource entries allocated - * by boot mem after the system is up. So for reusing the resource entry - * we need to remember the resource. + * Return the next node of @p in pre-order tree traversal. If + * @skip_children is true, skip the descendant nodes of @p in + * traversal. If @p is a descendant of @subtree_root, only traverse + * the subtree under @subtree_root. */ -static struct resource *bootmem_resource_free; -static DEFINE_SPINLOCK(bootmem_resource_lock); - -static struct resource *next_resource(struct resource *p, bool sibling_only) +static struct resource *next_resource(struct resource *p, bool skip_children, + struct resource *subtree_root) { - /* Caller wants to traverse through siblings only */ - if (sibling_only) - return p->sibling; - - if (p->child) + if (!skip_children && p->child) return p->child; - while (!p->sibling && p->parent) + while (!p->sibling && p->parent) { p = p->parent; + if (p == subtree_root) + return NULL; + } return p->sibling; } -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct resource *p = v; - (*pos)++; - return (void *)next_resource(p, false); -} +/* + * Traverse the resource subtree under @_root in pre-order, excluding + * @_root itself. + * + * NOTE: '__p' is introduced to avoid shadowing '_p' outside of loop. + * And it is referenced to avoid unused variable warning. + */ +#define for_each_resource(_root, _p, _skip_children) \ + for (typeof(_root) __root = (_root), __p = _p = __root->child; \ + __p && _p; _p = next_resource(_p, _skip_children, __root)) #ifdef CONFIG_PROC_FS @@ -88,14 +87,28 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); - loff_t l = 0; + struct resource *root = pde_data(file_inode(m->file)); + struct resource *p; + loff_t l = *pos; + read_lock(&resource_lock); - for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) - ; + for_each_resource(root, p, false) { + if (l-- == 0) + break; + } + return p; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + + (*pos)++; + + return (void *)next_resource(p, false, NULL); +} + static void r_stop(struct seq_file *m, void *v) __releases(resource_lock) { @@ -104,7 +117,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; @@ -149,36 +162,19 @@ __initcall(ioresources_init); static void free_resource(struct resource *res) { - if (!res) - return; - - if (!PageSlab(virt_to_head_page(res))) { - spin_lock(&bootmem_resource_lock); - res->sibling = bootmem_resource_free; - bootmem_resource_free = res; - spin_unlock(&bootmem_resource_lock); - } else { + /** + * If the resource was allocated using memblock early during boot + * we'll leak it here: we can only return full pages back to the + * buddy and trying to be smart and reusing them eventually in + * alloc_resource() overcomplicates resource handling. + */ + if (res && PageSlab(virt_to_head_page(res))) kfree(res); - } } static struct resource *alloc_resource(gfp_t flags) { - struct resource *res = NULL; - - spin_lock(&bootmem_resource_lock); - if (bootmem_resource_free) { - res = bootmem_resource_free; - bootmem_resource_free = res->sibling; - } - spin_unlock(&bootmem_resource_lock); - - if (res) - memset(res, 0, sizeof(struct resource)); - else - res = kzalloc(sizeof(struct resource), flags); - - return res; + return kzalloc(sizeof(struct resource), flags); } /* Return the conflict entry if you can't request it */ @@ -319,30 +315,32 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); +static bool is_type_match(struct resource *p, unsigned long flags, unsigned long desc) +{ + return (p->flags & flags) == flags && (desc == IORES_DESC_NONE || desc == p->desc); +} + /** - * Finds the lowest iomem resource that covers part of [@start..@end]. The - * caller must specify @start, @end, @flags, and @desc (which may be - * IORES_DESC_NONE). + * find_next_iomem_res - Finds the lowest iomem resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * - * This function walks the whole tree and not just first level children - * unless @first_lvl is true. - * * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have * @desc: descriptor the resource must have - * @first_lvl: walk only the first level children, if set * @res: return ptr, if resource found + * + * The caller must specify @start, @end, @flags, and @desc + * (which may be IORES_DESC_NONE). */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, struct resource *res) + struct resource *res) { - bool siblings_only = true; struct resource *p; if (!res) @@ -353,7 +351,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) { + for_each_resource(&iomem_resource, p, false) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -364,28 +362,20 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; - /* - * Now that we found a range that matches what we look for, - * check the flags and the descriptor. If we were not asked to - * use only the first level, start looking at children as well. - */ - siblings_only = first_lvl; - - if ((p->flags & flags) != flags) - continue; - if ((desc != IORES_DESC_NONE) && (desc != p->desc)) - continue; - /* Found a match, break */ - break; + if (is_type_match(p, flags, desc)) + break; } if (p) { /* copy data */ - res->start = max(start, p->start); - res->end = min(end, p->end); - res->flags = p->flags; - res->desc = p->desc; + *res = (struct resource) { + .start = max(start, p->start), + .end = min(end, p->end), + .flags = p->flags, + .desc = p->desc, + .parent = p->parent, + }; } read_unlock(&resource_lock); @@ -394,14 +384,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, void *arg, + void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { + !find_next_iomem_res(start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -413,11 +403,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * Walks through iomem resources and calls func() with matching resource - * ranges. This walks through whole tree and not just first level children. - * All the memory ranges which overlap start,end and also match flags and - * desc are valid candidates. - * + * walk_iomem_res_desc - Walks through iomem resources and calls func() + * with matching resource ranges. + * * * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. * @flags: I/O resource flags * @start: start addr @@ -425,13 +413,16 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * + * All the memory ranges which overlap start,end and also match flags and + * desc are valid candidates. + * * NOTE: For a new descriptor search, define a new IORES_DESC in * <linux/ioport.h> and set it in 'desc' of a target resource entry. */ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -447,8 +438,60 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); +} + +/* + * This function, being a variant of walk_system_ram_res(), calls the @func + * callback against all memory ranges of type System RAM which are marked as + * IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY in reversed order, i.e., from + * higher to lower. + */ +int walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + struct resource res, *rams; + int rams_size = 16, i; + unsigned long flags; + int ret = -1; + + /* create a list */ + rams = kvcalloc(rams_size, sizeof(struct resource), GFP_KERNEL); + if (!rams) + return ret; + + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + i = 0; + while ((start < end) && + (!find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res))) { + if (i >= rams_size) { + /* re-alloc */ + struct resource *rams_new; + + rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource), + GFP_KERNEL); + if (!rams_new) + goto out; + + rams = rams_new; + rams_size += 16; + } + + rams[i++] = res; + start = res.end + 1; + } + + /* go reverse */ + for (i--; i >= 0; i--) { + ret = (*func)(&rams[i], arg); + if (ret) + break; + } + +out: + kvfree(rams); + return ret; } /* @@ -460,17 +503,14 @@ int walk_mem_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. - * - * This will find System RAM ranges that are children of top-level resources - * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -485,8 +525,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && - !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - false, &res)) { + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) { pfn = PFN_UP(res.start); end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) @@ -513,6 +552,70 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc) +{ + int type = 0; int other = 0; + struct resource *p, *dp; + struct resource res, o; + bool covered; + + res.start = start; + res.end = start + size - 1; + + for (p = parent->child; p ; p = p->sibling) { + if (!resource_intersection(p, &res, &o)) + continue; + if (is_type_match(p, flags, desc)) { + type++; + continue; + } + /* + * Continue to search in descendant resources as if the + * matched descendant resources cover some ranges of 'p'. + * + * |------------- "CXL Window 0" ------------| + * |-- "System RAM" --| + * + * will behave similar as the following fake resource + * tree when searching "System RAM". + * + * |-- "System RAM" --||-- "CXL Window 0a" --| + */ + covered = false; + for_each_resource(p, dp, false) { + if (!resource_overlaps(dp, &res)) + continue; + if (is_type_match(dp, flags, desc)) { + type++; + /* + * Range from 'o.start' to 'dp->start' + * isn't covered by matched resource. + */ + if (dp->start > o.start) + break; + if (dp->end >= o.end) { + covered = true; + break; + } + /* Remove covered range */ + o.start = max(o.start, dp->end + 1); + } + } + if (!covered) + other++; + } + + if (type == 0) + return REGION_DISJOINT; + + if (other == 0) + return REGION_INTERSECTS; + + return REGION_MIXED; +} + /** * region_intersects() - determine intersection of region with known resources * @start: region start address @@ -536,31 +639,13 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; - int type = 0; int other = 0; - struct resource *p; - - res.start = start; - res.end = start + size - 1; + int ret; read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; - } + ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); - if (other == 0) - return type ? REGION_INTERSECTS : REGION_DISJOINT; - - if (type) - return REGION_MIXED; - - return REGION_DISJOINT; + return ret; } EXPORT_SYMBOL_GPL(region_intersects); @@ -568,14 +653,6 @@ void __weak arch_remove_reservations(struct resource *avail) { } -static resource_size_t simple_align_resource(void *data, - const struct resource *avail, - resource_size_t size, - resource_size_t align) -{ - return avail->start; -} - static void resource_clip(struct resource *res, resource_size_t min, resource_size_t max) { @@ -586,16 +663,16 @@ static void resource_clip(struct resource *res, resource_size_t min, } /* - * Find empty slot in the resource tree with the given range and + * Find empty space in the resource tree with the given range and * alignment constraints */ -static int __find_resource(struct resource *root, struct resource *old, - struct resource *new, - resource_size_t size, - struct resource_constraint *constraint) +static int __find_resource_space(struct resource *root, struct resource *old, + struct resource *new, resource_size_t size, + struct resource_constraint *constraint) { struct resource *this = root->child; struct resource tmp = *new, avail, alloc; + resource_alignf alignf = constraint->alignf; tmp.start = root->start; /* @@ -624,8 +701,12 @@ static int __find_resource(struct resource *root, struct resource *old, avail.flags = new->flags & ~IORESOURCE_UNSET; if (avail.start >= tmp.start) { alloc.flags = avail.flags; - alloc.start = constraint->alignf(constraint->alignf_data, &avail, - size, constraint->align); + if (alignf) { + alloc.start = alignf(constraint->alignf_data, + &avail, size, constraint->align); + } else { + alloc.start = avail.start; + } alloc.end = alloc.start + size - 1; if (alloc.start <= alloc.end && resource_contains(&avail, &alloc)) { @@ -645,15 +726,27 @@ next: if (!this || this->end == root->end) return -EBUSY; } -/* - * Find empty slot in the resource tree given range and alignment. +/** + * find_resource_space - Find empty space in the resource tree + * @root: Root resource descriptor + * @new: Resource descriptor awaiting an empty resource space + * @size: The minimum size of the empty space + * @constraint: The range and alignment constraints to be met + * + * Finds an empty space under @root in the resource tree satisfying range and + * alignment @constraints. + * + * Return: + * * %0 - if successful, @new members start, end, and flags are altered. + * * %-EBUSY - if no empty space was found. */ -static int find_resource(struct resource *root, struct resource *new, +int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, - struct resource_constraint *constraint) + struct resource_constraint *constraint) { - return __find_resource(root, NULL, new, size, constraint); + return __find_resource_space(root, NULL, new, size, constraint); } +EXPORT_SYMBOL_GPL(find_resource_space); /** * reallocate_resource - allocate a slot in the resource tree given range & alignment. @@ -663,7 +756,7 @@ static int find_resource(struct resource *root, struct resource *new, * @root: root resource descriptor * @old: resource descriptor desired by caller * @newsize: new size of the resource descriptor - * @constraint: the size and alignment constraints to be met. + * @constraint: the memory range and alignment constraints to be met. */ static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, @@ -675,7 +768,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, write_lock(&resource_lock); - if ((err = __find_resource(root, old, &new, newsize, constraint))) + if ((err = __find_resource_space(root, old, &new, newsize, constraint))) goto out; if (resource_contains(&new, old)) { @@ -719,18 +812,12 @@ out: int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), + resource_alignf alignf, void *alignf_data) { int err; struct resource_constraint constraint; - if (!alignf) - alignf = simple_align_resource; - constraint.min = min; constraint.max = max; constraint.align = align; @@ -744,7 +831,7 @@ int allocate_resource(struct resource *root, struct resource *new, } write_lock(&resource_lock); - err = find_resource(root, new, size, &constraint); + err = find_resource_space(root, new, size, &constraint); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); @@ -901,10 +988,17 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) if (conflict->end > new->end) new->end = conflict->end; - printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + pr_info("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); } write_unlock(&resource_lock); } +/* + * Not for general consumption, only early boot memory map parsing, PCI + * resource discovery, and late discovery of CXL resources are expected + * to use this interface. The former are built-in and only the latter, + * CXL, is a module. + */ +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, "CXL"); /** * remove_resource - Remove a resource in the resource tree @@ -1112,31 +1206,65 @@ resource_size_t resource_alignment(struct resource *res) static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait); -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, +static struct inode *iomem_inode; + +#ifdef CONFIG_IO_STRICT_DEVMEM +static void revoke_iomem(struct resource *res) +{ + /* pairs with smp_store_release() in iomem_init_inode() */ + struct inode *inode = smp_load_acquire(&iomem_inode); + + /* + * Check that the initialization has completed. Losing the race + * is ok because it means drivers are claiming resources before + * the fs_initcall level of init and prevent iomem_get_mapping users + * from establishing mappings. + */ + if (!inode) + return; + + /* + * The expectation is that the driver has successfully marked + * the resource busy by this point, so devmem_is_allowed() + * should start returning false, however for performance this + * does not iterate the entire resource range. + */ + if (devmem_is_allowed(PHYS_PFN(res->start)) && + devmem_is_allowed(PHYS_PFN(res->end))) { + /* + * *cringe* iomem=relaxed says "go ahead, what's the + * worst that can happen?" + */ + return; + } + + unmap_mapping_range(inode->i_mapping, res->start, resource_size(res), 1); +} +#else +static void revoke_iomem(struct resource *res) {} +#endif + +struct address_space *iomem_get_mapping(void) +{ + /* + * This function is only called from file open paths, hence guaranteed + * that fs_initcalls have completed and no need to check for NULL. But + * since revoke_iomem can be called before the initcall we still need + * the barrier to appease checkers. + */ + return smp_load_acquire(&iomem_inode)->i_mapping; +} + +static int __request_region_locked(struct resource *res, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = alloc_resource(GFP_KERNEL); - struct resource *orig_parent = parent; - - if (!res) - return NULL; res->name = name; res->start = start; res->end = start + n - 1; - write_lock(&resource_lock); - for (;;) { struct resource *conflict; @@ -1172,14 +1300,41 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - free_resource(res); - res = NULL; - break; + return -EBUSY; } + + return 0; +} + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource *__request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = alloc_resource(GFP_KERNEL); + int ret; + + if (!res) + return NULL; + + write_lock(&resource_lock); + ret = __request_region_locked(res, parent, start, n, name, flags); write_unlock(&resource_lock); - if (res && orig_parent == &iomem_resource) - revoke_devmem(res); + if (ret) { + free_resource(res); + return NULL; + } + + if (parent == &iomem_resource) + revoke_iomem(res); return res; } @@ -1228,16 +1383,13 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); - printk(KERN_WARNING "Trying to free nonexistent resource " - "<%016llx-%016llx>\n", (unsigned long long)start, - (unsigned long long)end); + pr_warn("Trying to free nonexistent resource <%pa-%pa>\n", &start, &end); } EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE /** * release_mem_region_adjustable - release a previously reserved memory region - * @parent: parent resource descriptor * @start: resource start address * @size: resource region size * @@ -1255,21 +1407,28 @@ EXPORT_SYMBOL(__release_region); * assumes that all children remain in the lower address entry for * simplicity. Enhance this logic when necessary. */ -int release_mem_region_adjustable(struct resource *parent, - resource_size_t start, resource_size_t size) +void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { + struct resource *parent = &iomem_resource; + struct resource *new_res = NULL; + bool alloc_nofail = false; struct resource **p; struct resource *res; - struct resource *new_res; resource_size_t end; - int ret = -EINVAL; end = start + size - 1; - if ((start < parent->start) || (end > parent->end)) - return ret; + if (WARN_ON_ONCE((start < parent->start) || (end > parent->end))) + return; - /* The alloc_resource() result gets checked later */ - new_res = alloc_resource(GFP_KERNEL); + /* + * We free up quite a lot of memory on memory hotunplug (esp., memap), + * just before releasing the region. This is highly unlikely to + * fail - let's play save and make it never fail as the caller cannot + * perform any error handling (e.g., trying to re-add memory will fail + * similarly). + */ +retry: + new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0)); p = &parent->child; write_lock(&resource_lock); @@ -1284,21 +1443,6 @@ int release_mem_region_adjustable(struct resource *parent, continue; } - /* - * All memory regions added from memory-hotplug path have the - * flag IORESOURCE_SYSTEM_RAM. If the resource does not have - * this flag, we know that we are dealing with a resource coming - * from HMM/devm. HMM/devm use another mechanism to add/release - * a resource. This goes via devm_request_mem_region and - * devm_release_mem_region. - * HMM/devm take care to release their resources when they want, - * so if we are dealing with them, let us just back off here. - */ - if (!(res->flags & IORESOURCE_SYSRAM)) { - ret = 0; - break; - } - if (!(res->flags & IORESOURCE_MEM)) break; @@ -1312,20 +1456,23 @@ int release_mem_region_adjustable(struct resource *parent, /* free the whole entry */ *p = res->sibling; free_resource(res); - ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ - ret = __adjust_resource(res, end + 1, - res->end - end); + WARN_ON_ONCE(__adjust_resource(res, end + 1, + res->end - end)); } else if (res->start != start && res->end == end) { /* adjust the end */ - ret = __adjust_resource(res, res->start, - start - res->start); + WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start)); } else { - /* split into two entries */ + /* split into two entries - we need a new resource */ if (!new_res) { - ret = -ENOMEM; - break; + new_res = alloc_resource(GFP_ATOMIC); + if (!new_res) { + alloc_nofail = true; + write_unlock(&resource_lock); + goto retry; + } } new_res->name = res->name; new_res->start = end + 1; @@ -1336,9 +1483,8 @@ int release_mem_region_adjustable(struct resource *parent, new_res->sibling = res->sibling; new_res->child = NULL; - ret = __adjust_resource(res, res->start, - start - res->start); - if (ret) + if (WARN_ON_ONCE(__adjust_resource(res, res->start, + start - res->start))) break; res->sibling = new_res; new_res = NULL; @@ -1349,10 +1495,69 @@ int release_mem_region_adjustable(struct resource *parent, write_unlock(&resource_lock); free_resource(new_res); - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ +#ifdef CONFIG_MEMORY_HOTPLUG +static bool system_ram_resources_mergeable(struct resource *r1, + struct resource *r2) +{ + /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */ + return r1->flags == r2->flags && r1->end + 1 == r2->start && + r1->name == r2->name && r1->desc == r2->desc && + !r1->child && !r2->child; +} + +/** + * merge_system_ram_resource - mark the System RAM resource mergeable and try to + * merge it with adjacent, mergeable resources + * @res: resource descriptor + * + * This interface is intended for memory hotplug, whereby lots of contiguous + * system ram resources are added (e.g., via add_memory*()) by a driver, and + * the actual resource boundaries are not of interest (e.g., it might be + * relevant for DIMMs). Only resources that are marked mergeable, that have the + * same parent, and that don't have any children are considered. All mergeable + * resources must be immutable during the request. + * + * Note: + * - The caller has to make sure that no pointers to resources that are + * marked mergeable are used anymore after this call - the resource might + * be freed and the pointer might be stale! + * - release_mem_region_adjustable() will split on demand on memory hotunplug + */ +void merge_system_ram_resource(struct resource *res) +{ + const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *cur; + + if (WARN_ON_ONCE((res->flags & flags) != flags)) + return; + + write_lock(&resource_lock); + res->flags |= IORESOURCE_SYSRAM_MERGEABLE; + + /* Try to merge with next item in the list. */ + cur = res->sibling; + if (cur && system_ram_resources_mergeable(res, cur)) { + res->end = cur->end; + res->sibling = cur->sibling; + free_resource(cur); + } + + /* Try to merge with previous item in the list. */ + cur = res->parent->child; + while (cur && cur->sibling != res) + cur = cur->sibling; + if (cur && system_ram_resources_mergeable(cur, res)) { + cur->end = res->end; + cur->sibling = res->sibling; + free_resource(res); + } + write_unlock(&resource_lock); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + /* * Managed region resource */ @@ -1478,8 +1683,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, { struct region_devres match_data = { parent, start, n }; - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + WARN_ON(devres_release(dev, devm_region_release, devm_region_match, &match_data)); } EXPORT_SYMBOL(__devm_release_region); @@ -1536,22 +1740,22 @@ __setup("reserve=", reserve_setup); */ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) { - struct resource *p = &iomem_resource; + resource_size_t end = addr + size - 1; + struct resource *p; int err = 0; - loff_t l; read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { + for_each_resource(&iomem_resource, p, false) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? */ - if (p->start >= addr + size) + if (p->start > end) continue; if (p->end < addr) continue; if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && - PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) + PFN_DOWN(p->end) >= PFN_DOWN(end)) continue; /* * if a resource is "BUSY", it's not a hardware resource @@ -1562,10 +1766,8 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (p->flags & IORESOURCE_BUSY) continue; - printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", - (unsigned long long)addr, - (unsigned long long)(addr + size - 1), - p->name, p); + pr_warn("resource sanity check: requesting [mem %pa-%pa], which spans more than %s %pR\n", + &addr, &end, p->name, p); err = -1; break; } @@ -1581,37 +1783,46 @@ static int strict_iomem_checks; #endif /* - * check if an address is reserved in the iomem resource tree - * returns true if reserved, false if not reserved. + * Check if an address is exclusive to the kernel and must not be mapped to + * user space, for example, via /dev/mem. + * + * Returns true if exclusive to the kernel, otherwise returns false. */ -bool iomem_is_exclusive(u64 addr) +bool resource_is_exclusive(struct resource *root, u64 addr, resource_size_t size) { - struct resource *p = &iomem_resource; - bool err = false; - loff_t l; - int size = PAGE_SIZE; - - if (!strict_iomem_checks) - return false; - - addr = addr & PAGE_MASK; + const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM | + IORESOURCE_EXCLUSIVE; + bool skip_children = false, err = false; + struct resource *p; read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ + for_each_resource(root, p, skip_children) { if (p->start >= addr + size) break; - if (p->end < addr) + if (p->end < addr) { + skip_children = true; continue; + } + skip_children = false; + + /* + * IORESOURCE_SYSTEM_RAM resources are exclusive if + * IORESOURCE_EXCLUSIVE is set, even if they + * are not busy and even if "iomem=relaxed" is set. The + * responsible driver dynamically adds/removes system RAM within + * such an area and uncontrolled access is dangerous. + */ + if ((p->flags & exclusive_system_ram) == exclusive_system_ram) { + err = true; + break; + } + /* * A resource is exclusive if IORESOURCE_EXCLUSIVE is set * or CONFIG_IO_STRICT_DEVMEM is enabled and the * resource is busy. */ - if ((p->flags & IORESOURCE_BUSY) == 0) + if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY)) continue; if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) || p->flags & IORESOURCE_EXCLUSIVE) { @@ -1624,6 +1835,12 @@ bool iomem_is_exclusive(u64 addr) return err; } +bool iomem_is_exclusive(u64 addr) +{ + return resource_is_exclusive(&iomem_resource, addr & PAGE_MASK, + PAGE_SIZE); +} + struct resource_entry *resource_list_create_entry(struct resource *res, size_t extra_size) { @@ -1648,31 +1865,141 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); -#ifdef CONFIG_DEVICE_PRIVATE -static struct resource *__request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size, const char *name) +#ifdef CONFIG_GET_FREE_REGION +#define GFR_DESCENDING (1UL << 0) +#define GFR_REQUEST_REGION (1UL << 1) +#ifdef PA_SECTION_SHIFT +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#else +#define GFR_DEFAULT_ALIGN PAGE_SIZE +#endif + +static resource_size_t gfr_start(struct resource *base, resource_size_t size, + resource_size_t align, unsigned long flags) +{ + if (flags & GFR_DESCENDING) { + resource_size_t end; + + end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); + return end - size + 1; + } + + return ALIGN(max(base->start, align), align); +} + +static bool gfr_continue(struct resource *base, resource_size_t addr, + resource_size_t size, unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr > size && addr >= base->start; + /* + * In the ascend case be careful that the last increment by + * @size did not wrap 0. + */ + return addr > addr - size && + addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); +} + +static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, + unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr - size; + return addr + size; +} + +static void remove_free_mem_region(void *_res) +{ + struct resource *res = _res; + + if (res->parent) + remove_resource(res); + free_resource(res); +} + +static struct resource * +get_free_mem_region(struct device *dev, struct resource *base, + resource_size_t size, const unsigned long align, + const char *name, const unsigned long desc, + const unsigned long flags) { - resource_size_t end, addr; + resource_size_t addr; struct resource *res; + struct region_devres *dr = NULL; - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); - addr = end - size + 1UL; + size = ALIGN(size, align); - for (; addr > size && addr >= base->start; addr -= size) { - if (region_intersects(addr, size, 0, IORES_DESC_NONE) != - REGION_DISJOINT) - continue; + res = alloc_resource(GFP_KERNEL); + if (!res) + return ERR_PTR(-ENOMEM); - if (dev) - res = devm_request_mem_region(dev, addr, size, name); - else - res = request_mem_region(addr, size, name); - if (!res) + if (dev && (flags & GFR_REQUEST_REGION)) { + dr = devres_alloc(devm_region_release, + sizeof(struct region_devres), GFP_KERNEL); + if (!dr) { + free_resource(res); return ERR_PTR(-ENOMEM); - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + } + } else if (dev) { + if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) + return ERR_PTR(-ENOMEM); + } + + write_lock(&resource_lock); + for (addr = gfr_start(base, size, align, flags); + gfr_continue(base, addr, align, flags); + addr = gfr_next(addr, align, flags)) { + if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + if (flags & GFR_REQUEST_REGION) { + if (__request_region_locked(res, &iomem_resource, addr, + size, name, 0)) + break; + + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } + + res->desc = desc; + write_unlock(&resource_lock); + + + /* + * A driver is claiming this region so revoke any + * mappings. + */ + revoke_iomem(res); + } else { + res->start = addr; + res->end = addr + size - 1; + res->name = name; + res->desc = desc; + res->flags = IORESOURCE_MEM; + + /* + * Only succeed if the resource hosts an exclusive + * range after the insert + */ + if (__insert_resource(base, res) || res->child) + break; + + write_unlock(&resource_lock); + } + return res; } + write_unlock(&resource_lock); + + if (flags & GFR_REQUEST_REGION) { + free_resource(res); + devres_free(dr); + } else if (dev) + devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } @@ -1691,18 +2018,48 @@ static struct resource *__request_free_mem_region(struct device *dev, struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { - return __request_free_mem_region(dev, base, size, dev_name(dev)); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, + dev_name(dev), + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { - return __request_free_mem_region(NULL, base, size, name); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); -#endif /* CONFIG_DEVICE_PRIVATE */ +/** + * alloc_free_mem_region - find a free region relative to @base + * @base: resource that will parent the new resource + * @size: size in bytes of memory to allocate from @base + * @align: alignment requirements for the allocation + * @name: resource name + * + * Buses like CXL, that can dynamically instantiate new memory regions, + * need a method to allocate physical address space for those regions. + * Allocate and insert a new resource to cover a free, unclaimed by a + * descendant of @base, range in the span of @base. + */ +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, + const char *name) +{ + /* Default of ascending direction and insert resource */ + unsigned long flags = 0; + + return get_free_mem_region(NULL, base, size, align, name, + IORES_DESC_NONE, flags); +} +EXPORT_SYMBOL_GPL(alloc_free_mem_region); +#endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { @@ -1713,4 +2070,48 @@ static int __init strict_iomem(char *str) return 1; } +static int iomem_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, DEVMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type iomem_fs_type = { + .name = "iomem", + .owner = THIS_MODULE, + .init_fs_context = iomem_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int __init iomem_init_inode(void) +{ + static struct vfsmount *iomem_vfs_mount; + static int iomem_fs_cnt; + struct inode *inode; + int rc; + + rc = simple_pin_fs(&iomem_fs_type, &iomem_vfs_mount, &iomem_fs_cnt); + if (rc < 0) { + pr_err("Cannot mount iomem pseudo filesystem: %d\n", rc); + return rc; + } + + inode = alloc_anon_inode(iomem_vfs_mount->mnt_sb); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + pr_err("Cannot allocate inode for iomem: %d\n", rc); + simple_release_fs(&iomem_vfs_mount, &iomem_fs_cnt); + return rc; + } + + /* + * Publish iomem revocation inode initialized. + * Pairs with smp_load_acquire() in revoke_iomem(). + */ + smp_store_release(&iomem_inode, inode); + + return 0; +} + +fs_initcall(iomem_init_inode); + __setup("iomem=", strict_iomem); |