Diffstat (limited to 'mm/nommu.c')
| -rw-r--r-- | mm/nommu.c | 847 |
1 file changed, 401 insertions(+), 446 deletions(-)
diff --git a/mm/nommu.c b/mm/nommu.c index 749276beb109..c3a23b082adb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1,10 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/nommu.c * * Replacement code for mm functions to support CPU's that don't * have any form of memory management unit (thus no virtual memory). * - * See Documentation/nommu-mmap.txt + * See Documentation/admin-guide/mm/nommu-mmap.rst * * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> @@ -18,7 +19,6 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/sched/mm.h> -#include <linux/vmacache.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/file.h> @@ -26,7 +26,6 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/compiler.h> #include <linux/mount.h> @@ -37,23 +36,17 @@ #include <linux/printk.h> #include <linux/uaccess.h> +#include <linux/uio.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include "internal.h" -void *high_memory; -EXPORT_SYMBOL(high_memory); -struct page *mem_map; -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; @@ -71,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = { */ unsigned int kobjsize(const void *objp) { - struct page *page; + struct folio *folio; /* * If the object we have should not have ksize performed on it, @@ -80,22 +73,22 @@ unsigned int kobjsize(const void *objp) if (!objp || !virt_addr_valid(objp)) return 0; - page = virt_to_head_page(objp); + folio = virt_to_folio(objp); /* * If the allocator sets PageSlab, we know the pointer came from * kmalloc(). */ - if (PageSlab(page)) + if (folio_test_slab(folio)) return ksize(objp); /* - * If it's not a compound page, see if we have a matching VMA + * If it's not a large folio, see if we have a matching VMA * region. This test is intentionally done in reverse order, * so if there's no VMA, we still fall through and hand back - * PAGE_SIZE for 0-order pages. + * PAGE_SIZE for 0-order folios. */ - if (!PageCompound(page)) { + if (!folio_test_large(folio)) { struct vm_area_struct *vma; vma = find_vma(current->mm, (unsigned long)objp); @@ -107,159 +100,68 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. */ - return PAGE_SIZE << compound_order(page); + return folio_size(folio); } -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int foll_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - struct vm_area_struct *vma; - unsigned long vm_flags; - int i; - - /* calculate required read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (foll_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (foll_flags & FOLL_FORCE) ? 
- (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - - for (i = 0; i < nr_pages; i++) { - vma = find_vma(mm, start); - if (!vma) - goto finish_or_fault; - - /* protect what we can, including chardevs */ - if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - goto finish_or_fault; - - if (pages) { - pages[i] = virt_to_page(start); - if (pages[i]) - get_page(pages[i]); - } - if (vmas) - vmas[i] = vma; - start = (start + PAGE_SIZE) & PAGE_MASK; - } - - return i; - -finish_or_fault: - return i ? : -EFAULT; -} - -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages(current, current->mm, start, nr_pages, - gup_flags, pages, vmas, NULL); -} -EXPORT_SYMBOL(get_user_pages); - -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, NULL); -} -EXPORT_SYMBOL(get_user_pages_locked); - -static long __get_user_pages_unlocked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, - unsigned long nr_pages, struct page **pages, - unsigned int gup_flags) -{ - long ret; - down_read(&mm->mmap_sem); - ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, - NULL, NULL); - up_read(&mm->mmap_sem); - return ret; -} - -long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) -{ - return __get_user_pages_unlocked(current, current->mm, start, nr_pages, - pages, gup_flags); -} -EXPORT_SYMBOL(get_user_pages_unlocked); - -/** - * follow_pfn - look up PFN at a user virtual address - * @vma: memory mapping - * @address: user virtual address - * @pfn: location to store found PFN - * - * Only IO mappings and raw PFN mappings are allowed. - * - * Returns zero and the pfn at @pfn on success, -ve otherwise. - */ -int follow_pfn(struct vm_area_struct *vma, unsigned long address, - unsigned long *pfn) -{ - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) - return -EINVAL; - - *pfn = address >> PAGE_SHIFT; - return 0; -} -EXPORT_SYMBOL(follow_pfn); - -LIST_HEAD(vmap_area_list); - void vfree(const void *addr) { kfree(addr); } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() * returns only a logical address. 
*/ - return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); + return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); +} +EXPORT_SYMBOL(__vmalloc_noprof); + +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) +{ + return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); +} + +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +{ + return __vmalloc_noprof(size, gfp_mask); } -EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc_noprof(size, gfp_mask); } -void *vmalloc_user(unsigned long size) +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) - vma->vm_flags |= VM_USERMAP; - up_write(¤t->mm->mmap_sem); + vm_flags_set(vma, VM_USERMAP); + mmap_write_unlock(current->mm); } return ret; } -EXPORT_SYMBOL(vmalloc_user); + +void *vmalloc_user_noprof(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user_noprof); struct page *vmalloc_to_page(const void *addr) { @@ -273,24 +175,13 @@ unsigned long vmalloc_to_pfn(const void *addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -long vread(char *buf, char *addr, unsigned long count) -{ - /* Don't allow overflow */ - if ((unsigned long) buf + count < count) - count = -(unsigned long) buf; - - memcpy(buf, addr, count); - return count; -} - -long vwrite(char *buf, char *addr, unsigned long count) +long vread_iter(struct iov_iter *iter, const char *addr, size_t count) { /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; - memcpy(addr, buf, count); - return count; + return copy_to_iter(addr, count, iter); } /* @@ -304,11 +195,29 @@ long vwrite(char *buf, char *addr, unsigned long count) * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) +{ + return __vmalloc_noprof(size, GFP_KERNEL); +} +EXPORT_SYMBOL(vmalloc_noprof); + +/* + * vmalloc_huge_node - allocate virtually contiguous memory, on a node + * + * @size: allocation size + * @gfp_mask: flags for the page level allocator + * @node: node to use for allocation or NUMA_NO_NODE + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * Due to NOMMU implications the node argument and HUGE page attribute is + * ignored. + */ +void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc_noprof(size, gfp_mask); } -EXPORT_SYMBOL(vmalloc); /* * vzalloc - allocate virtually contiguous memory with zero fill @@ -322,12 +231,11 @@ EXPORT_SYMBOL(vmalloc); * For tight control over page level allocator and protection flags * use __vmalloc() instead. 
*/ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_node - allocate memory on a specific node @@ -340,11 +248,11 @@ EXPORT_SYMBOL(vzalloc); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return vmalloc(size); + return vmalloc_noprof(size); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -358,28 +266,11 @@ EXPORT_SYMBOL(vmalloc_node); * For tight control over page level allocator and protection flags * use __vmalloc() instead. */ -void *vzalloc_node(unsigned long size, int node) -{ - return vzalloc(size); -} -EXPORT_SYMBOL(vzalloc_node); - -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ - -void *vmalloc_exec(unsigned long size) +void *vzalloc_node_noprof(unsigned long size, int node) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return vzalloc_noprof(size); } +EXPORT_SYMBOL(vzalloc_node_noprof); /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) @@ -388,11 +279,11 @@ void *vmalloc_exec(unsigned long size) * Allocate enough 32bit PA addressable pages to cover @size from the * page level allocator and map them into contiguous kernel virtual space. */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_noprof(size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory @@ -404,15 +295,15 @@ EXPORT_SYMBOL(vmalloc_32); * VM_USERMAP is set on the corresponding VMA so that subsequent calls to * remap_vmalloc_range() are permissible. */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { /* * We'll have to sort out the ZONE_DMA bits for 64-bit, * but for now this can simply use vmalloc_user() directly. */ - return vmalloc_user(size); + return vmalloc_user_noprof(size); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { @@ -427,7 +318,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; @@ -445,21 +336,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. 
- */ -void __weak vmalloc_sync_all(void) -{ -} - -struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) -{ - BUG(); - return NULL; -} -EXPORT_SYMBOL_GPL(alloc_vm_area); - void free_vm_area(struct vm_struct *area) { BUG(); @@ -473,6 +349,27 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty @@ -501,12 +398,26 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ - flush_icache_range(mm->brk, brk); + flush_icache_user_range(mm->brk, brk); return mm->brk = brk; } +static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; + +static const struct ctl_table nommu_table[] = { + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + /* - * initialise the percpu counter for VM and region record slabs + * initialise the percpu counter for VM and region record slabs, initialise VMA + * state. */ void __init mmap_init(void) { @@ -515,6 +426,8 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); + register_sysctl_init("vm", nommu_table); + vma_state_init(); } /* @@ -601,7 +514,7 @@ static void delete_nommu_region(struct vm_region *region) static void free_page_series(unsigned long from, unsigned long to) { for (; from < to; from += PAGE_SIZE) { - struct page *page = virt_to_page(from); + struct page *page = virt_to_page((void *)from); atomic_long_dec(&mmap_pages_allocated); put_page(page); @@ -646,26 +559,13 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_sem held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { - struct vm_area_struct *pvma, *prev; - struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; - - BUG_ON(!vma->vm_region); - - mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -673,67 +573,14 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } - - /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the 
latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); - - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - - __vma_link_list(mm, vma, prev, parent); } -/* - * delete a VMA from its owning mm_struct and address space - */ -static void delete_vma_from_mm(struct vm_area_struct *vma) -{ - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; - - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } - } - +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -742,81 +589,80 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (vma->vm_prev) - vma->vm_prev->vm_next = vma->vm_next; - else - mm->mmap = vma->vm_next; + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, NULL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); - if (vma->vm_next) - vma->vm_next->vm_prev = vma->vm_prev; + /* remove from the MM's tree and list */ + vma_iter_clear(&vmi); + return 0; } - /* * destroy a VMA record */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); vm_area_free(vma); } -/* - * look up the first VMA in which addr resides, NULL if none - * - should be called with mm->mmap_sem at least held readlocked - */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) { - struct vm_area_struct *vma; + unsigned long index = start_addr; - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } - - return NULL; + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); } -EXPORT_SYMBOL(find_vma); +EXPORT_SYMBOL(find_vma_intersection); /* - * find a VMA - * - we don't extend 
stack VMAs under NOMMU conditions + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_lock at least held readlocked */ -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - return find_vma(mm, addr); + VMA_ITERATOR(vmi, mm, addr); + + return vma_iter_load(&vmi); } +EXPORT_SYMBOL(find_vma); /* * expand a stack to a given address * - not supported under NOMMU conditions */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) +int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) { return -ENOMEM; } +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_unlock(mm); + return NULL; +} + /* * look up the first VMA exactly that exactly matches addr - * - should be called with mm->mmap_sem at least held readlocked + * - should be called with mm->mmap_lock at least held readlocked */ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long addr, @@ -824,26 +670,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + VMA_ITERATOR(vmi, mm, addr); - /* check the cache first */ - vma = vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = vma_iter_load(&vmi); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + return vma; } /* @@ -883,7 +720,7 @@ static int validate_mmap_request(struct file *file, if (file) { /* files must support mmap */ - if (!file->f_op->mmap) + if (!can_mmap_file(file)) return -ENODEV; /* work out if what we've got could possibly be shared @@ -934,9 +771,6 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file)) - return -EAGAIN; - if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; @@ -1011,37 +845,44 @@ static int validate_mmap_request(struct file *file, * we've determined that we can make the mapping, now translate what we * now know into VMA flags */ -static unsigned long determine_vm_flags(struct file *file, - unsigned long prot, - unsigned long flags, - unsigned long capabilities) +static vm_flags_t determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) { - unsigned long vm_flags; + vm_flags_t vm_flags; - vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); - /* vm_flags |= mm->def_flags; */ + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); - if (!(capabilities & NOMMU_MAP_DIRECT)) { - /* attempt to share read-only copies of mapped file chunks */ + if (!file) { + /* + * MAP_ANONYMOUS. MAP_SHARED is mapped to MAP_PRIVATE, because + * there is no fork(). 
+ */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (file && !(prot & PROT_WRITE)) - vm_flags |= VM_MAYSHARE; + } else if (flags & MAP_PRIVATE) { + /* MAP_PRIVATE file mapping */ + if (capabilities & NOMMU_MAP_DIRECT) + vm_flags |= (capabilities & NOMMU_VMFLAGS); + else + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (!(prot & PROT_WRITE) && !current->ptrace) + /* + * R/O private file mapping which cannot be used to + * modify memory, especially also not via active ptrace + * (e.g., set breakpoints) or later by upgrading + * permissions (no mprotect()). We can try overlaying + * the file mapping, which will work e.g., on chardevs, + * ramfs/tmpfs/shmfs and romfs/cramf. + */ + vm_flags |= VM_MAYOVERLAY; } else { - /* overlay a shareable mapping on the backing device or inode - * if possible - used for chardevs, ramfs/tmpfs/shmfs and - * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); - if (flags & MAP_SHARED) - vm_flags |= VM_SHARED; + /* MAP_SHARED file mapping: NOMMU_MAP_DIRECT is set. */ + vm_flags |= VM_SHARED | VM_MAYSHARE | + (capabilities & NOMMU_VMFLAGS); } - /* refuse to let anyone share private mappings with this process if - * it's being traced - otherwise breakpoints set in it may interfere - * with another untraced process - */ - if ((flags & MAP_PRIVATE) && current->ptrace) - vm_flags &= ~VM_MAYSHARE; - return vm_flags; } @@ -1053,7 +894,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; @@ -1079,15 +920,18 @@ static int do_mmap_private(struct vm_area_struct *vma, void *base; int ret, order; - /* invoke the file's mapping function so that it can keep track of - * shared mappings on devices or memory - * - VM_MAYSHARE will be set if it may attempt to share + /* + * Invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory. VM_MAYOVERLAY will be set if + * it may attempt to share, which will make is_nommu_shared_mapping() + * happy. 
*/ if (capabilities & NOMMU_MAP_DIRECT) { - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); + /* shouldn't return success if we're not sharing */ + if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags))) + ret = -ENOSYS; if (ret == 0) { - /* shouldn't return success if we're not sharing */ - BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; } @@ -1118,7 +962,8 @@ static int do_mmap_private(struct vm_area_struct *vma, atomic_long_add(total, &mmap_pages_allocated); - region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); + region->vm_flags = vma->vm_flags; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); @@ -1157,7 +1002,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } @@ -1179,6 +1024,7 @@ unsigned long do_mmap(struct file *file, struct rb_node *rb; unsigned long capabilities, result; int ret; + VMA_ITERATOR(vmi, current->mm, 0); *populate = 0; @@ -1197,6 +1043,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags |= determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1210,7 +1057,7 @@ unsigned long do_mmap(struct file *file, region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -1228,7 +1075,7 @@ unsigned long do_mmap(struct file *file, * these cases, sharing is handled in the driver or filesystem rather * than here */ - if (vm_flags & VM_MAYSHARE) { + if (is_nommu_shared_mapping(vm_flags)) { struct vm_region *pregion; unsigned long pglen, rpglen, pgend, rpgend, start; @@ -1238,7 +1085,7 @@ unsigned long do_mmap(struct file *file, for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(pregion->vm_flags & VM_MAYSHARE)) + if (!is_nommu_shared_mapping(pregion->vm_flags)) continue; /* search for overlapping mappings on the same file */ @@ -1274,7 +1121,7 @@ unsigned long do_mmap(struct file *file, vma->vm_end = start + len; if (pregion->vm_flags & VM_MAPPED_COPY) - vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); else { ret = do_mmap_shared_file(vma); if (ret < 0) { @@ -1334,7 +1181,9 @@ unsigned long do_mmap(struct file *file, add_nommu_region(region); /* clear anonymous mappings that don't ask for uninitialized data */ - if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + if (!vma->vm_file && + (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) || + !(flags & MAP_UNINITIALIZED))) memset((void *)region->vm_start, 0, region->vm_end - region->vm_start); @@ -1344,12 +1193,20 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + BUG_ON(!vma->vm_region); + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + goto error_just_free; + + setup_vma_to_mm(vma, current->mm); + current->mm->map_count++; + /* add the VMA to the tree */ + vma_iter_store_new(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { - 
flush_icache_range(region->vm_start, region->vm_end); + flush_icache_user_range(region->vm_start, region->vm_end); region->vm_icache_flushed = true; } @@ -1360,6 +1217,7 @@ share: error_just_free: up_write(&nommu_region_sem); error: + vma_iter_free(&vmi); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1378,13 +1236,13 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } @@ -1402,8 +1260,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out; } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) @@ -1447,18 +1303,20 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) return -ENOMEM; + mm = vma->vm_mm; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1467,10 +1325,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return -ENOMEM; new = vm_area_dup(vma); - if (!new) { - kmem_cache_free(vm_region_jar, region); - return -ENOMEM; - } + if (!new) + goto err_vma_dup; /* most fields are the same, copy all, and then fixup */ *region = *vma->vm_region; @@ -1485,10 +1341,16 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, region->vm_pgoff = new->vm_pgoff += npages; } + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, vma)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_vmi_preallocate; + } + if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1501,16 +1363,25 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + vma_iter_store_new(vmi, new); + mm->map_count++; return 0; + +err_vmi_preallocate: + vm_area_free(new); +err_vma_dup: + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; } /* * shrink a VMA by removing the specified chunk from either the beginning or * the end */ -static int shrink_vma(struct mm_struct *mm, +static int vmi_shrink_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long from, unsigned long to) { @@ -1518,12 +1389,15 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); - if (from > vma->vm_start) + if (from > vma->vm_start) { + if 
(vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL)) + return -ENOMEM; vma->vm_end = from; - else + } else { + if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL)) + return -ENOMEM; vma->vm_start = to; - add_vma_to_mm(mm, vma); + } /* cut the backing region down to size */ region = vma->vm_region; @@ -1551,9 +1425,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1562,7 +1437,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = find_vma(mm, start); + vma = vma_find(&vmi, end); if (!vma) { static int limit; if (limit < 5) { @@ -1581,7 +1456,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = vma_find(&vmi, end); } while (vma); return -EINVAL; } else { @@ -1595,28 +1470,29 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } - return shrink_vma(mm, vma, start, end); + return vmi_shrink_vma(&vmi, vma, start, end); } erase_whole_vma: - delete_vma_from_mm(vma); - delete_vma(mm, vma); - return 0; + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; + else + delete_vma(mm, vma); + return ret; } -EXPORT_SYMBOL(do_munmap); int vm_munmap(unsigned long addr, size_t len) { struct mm_struct *mm = current->mm; int ret; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); ret = do_munmap(mm, addr, len, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(vm_munmap); @@ -1631,6 +1507,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1638,17 +1515,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } -} - -int vm_brk(unsigned long addr, unsigned long len) -{ - return -ENOMEM; + __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } /* @@ -1686,7 +1564,7 @@ static unsigned long do_mremap(unsigned long addr, if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vma->vm_flags & VM_MAYSHARE) + if (is_nommu_shared_mapping(vma->vm_flags)) return (unsigned long) -EPERM; if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) @@ -1703,25 +1581,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, { unsigned long ret; - down_write(¤t->mm->mmap_sem); + mmap_write_lock(current->mm); ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); return ret; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - return NULL; -} - int remap_pfn_range(struct vm_area_struct 
*vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(remap_pfn_range); @@ -1751,12 +1623,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - return -ENOMEM; -} - vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); @@ -1764,20 +1630,22 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, +vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); + return 0; } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; - down_read(&mm->mmap_sem); + if (mmap_read_lock_killable(mm)) + return 0; /* the access must start within one of the target process's mappings */ vma = find_vma(mm, addr); @@ -1799,7 +1667,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, len = 0; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); return len; } @@ -1817,7 +1685,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1836,13 +1704,92 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len) +{ + unsigned long addr_end; + struct vm_area_struct *vma; + int ret = -EFAULT; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return ret; + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (!vma) + goto out; + + if (check_add_overflow(addr, len, &addr_end)) + goto out; + + /* don't overrun this mapping */ + if (addr_end > vma->vm_end) + len = vma->vm_end - addr; + + /* only read mappings where it is permitted */ + if (vma->vm_flags & VM_MAYREAD) { + ret = strscpy(buf, (char *)addr, len); + if (ret < 0) + ret = len - 1; + } + +out: + mmap_read_unlock(mm); + return ret; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour (unused) + * + * The caller must hold a reference on @mm. 
+ * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /** * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode * @inode: The inode to check @@ -1850,8 +1797,8 @@ EXPORT_SYMBOL_GPL(access_process_vm); * @newsize: The proposed filesize of the inode * * Check the shared mappings on an inode on behalf of a shrinking truncate to - * make sure that that any outstanding VMAs aren't broken and then shrink the - * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * make sure that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend beyond so that do_mmap() doesn't * automatically grant mappings that are too large. */ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, @@ -1919,7 +1866,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1940,9 +1887,17 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } subsys_initcall(init_admin_reserve); + +int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + mmap_write_lock(oldmm); + dup_mm_exe_file(mm, oldmm); + mmap_write_unlock(oldmm); + return 0; +} |
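One recurring change in this diff is that direct writes to vma->vm_flags (for example "vma->vm_flags |= VM_MAPPED_COPY" in do_mmap_private(), or the flag updates in remap_pfn_range() and vmalloc_user()) are replaced by the vm_flags_init()/vm_flags_set() helpers. The sketch below is not part of the commit; it only illustrates, for a hypothetical driver ->mmap handler (demo_mmap is an invented name), how a caller would use the helper instead of modifying the field directly. The mmap handler runs with the caller holding mmap_lock for write, which is what the helper expects.

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical driver ->mmap handler, for illustration only. */
static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Previously written as: vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; */
	vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP);
	return 0;
}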
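The other large change is that the old rbtree plus vm_next linked-list bookkeeping (the removed add_vma_to_mm()/delete_vma_from_mm() walking mm->mm_rb and mm->mmap, and the vmacache lookups) is replaced by the maple-tree based VMA iterator: find_vma() becomes a vma_iter_load(), exit_mmap() uses for_each_vma(), and do_munmap() uses vma_find(). As a rough illustration of the new walk, under the same locking the diff relies on (demo_walk_vmas is hypothetical and not in the commit):

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/printk.h>

/* Hypothetical helper: print every VMA of an mm via the maple-tree iterator. */
static void demo_walk_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	mmap_read_lock(mm);	/* the iterator must be used under mmap_lock */
	for_each_vma(vmi, vma)
		pr_info("vma %#lx-%#lx flags %#lx\n",
			vma->vm_start, vma->vm_end,
			(unsigned long)vma->vm_flags);
	mmap_read_unlock(mm);
}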
