diff options
Diffstat (limited to 'mm/util.c')
| -rw-r--r-- | mm/util.c | 1514 |
1 files changed, 1287 insertions, 227 deletions
diff --git a/mm/util.c b/mm/util.c index 7441c41d00f6..97cae40c0209 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,61 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <asm/uaccess.h> +#include <linux/sysctl.h> +#include <linux/mman.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/userfaultfd_k.h> +#include <linux/elf.h> +#include <linux/elf-randomize.h> +#include <linux/personality.h> +#include <linux/random.h> +#include <linux/processor.h> +#include <linux/sizes.h> +#include <linux/compat.h> +#include <linux/fsnotify.h> +#include <linux/page_idle.h> + +#include <linux/uaccess.h> + +#include <kunit/visibility.h> #include "internal.h" +#include "swap.h" -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section. + */ +void kfree_const(const void *x) +{ + if (!is_kernel_rodata((unsigned long)x)) + kfree(x); +} +EXPORT_SYMBOL(kfree_const); /** - * kstrdup - allocate space for and copy an existing string - * @s: the string to duplicate + * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated. + * @s: The data to copy + * @len: The size of the data, not including the NUL terminator * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ -char *kstrdup(const char *s, gfp_t gfp) +static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp) { - size_t len; char *buf; - if (!s) + /* '+1' for the NUL terminator */ + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) return NULL; - len = strlen(s) + 1; - buf = kmalloc_track_caller(len, gfp); - if (buf) - memcpy(buf, s, len); + memcpy(buf, s, len); + /* Ensure the buf is always NUL-terminated, regardless of @s. */ + buf[len] = '\0'; return buf; } + +/** + * kstrdup - allocate space for and copy an existing string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Return: newly allocated copy of @s or %NULL in case of error + */ +noinline +char *kstrdup(const char *s, gfp_t gfp) +{ + return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL; +} EXPORT_SYMBOL(kstrdup); /** + * kstrdup_const - conditionally duplicate an existing const string + * @s: the string to duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Note: Strings allocated by kstrdup_const should be freed by kfree_const and + * must not be passed to krealloc(). + * + * Return: source string if it is in .rodata section otherwise + * fallback to kstrdup. + */ +const char *kstrdup_const(const char *s, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)s)) + return s; + + return kstrdup(s, gfp); +} +EXPORT_SYMBOL(kstrdup_const); + +/** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Note: Use kmemdup_nul() instead if the size is known exactly. + * + * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { - size_t len; - char *buf; - - if (!s) - return NULL; - - len = strnlen(s, max); - buf = kmalloc_track_caller(len+1, gfp); - if (buf) { - memcpy(buf, s, len); - buf[len] = '\0'; - } - return buf; + return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL; } EXPORT_SYMBOL(kstrndup); @@ -65,143 +127,142 @@ EXPORT_SYMBOL(kstrndup); * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. */ -void *kmemdup(const void *src, size_t len, gfp_t gfp) +void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; - p = kmalloc_track_caller(len, gfp); + p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } -EXPORT_SYMBOL(kmemdup); +EXPORT_SYMBOL(kmemdup_noprof); /** - * memdup_user - duplicate memory region from user space + * kmemdup_array - duplicate a given array. * - * @src: source address in user space - * @len: number of bytes to copy + * @src: array to duplicate. + * @count: number of elements to duplicate from array. + * @element_size: size of each element of array. + * @gfp: GFP mask to use. * - * Returns an ERR_PTR() on failure. + * Return: duplicated array of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. */ -void *memdup_user(const void __user *src, size_t len) +void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { - void *p; - - /* - * Always use GFP_KERNEL, since copy_from_user() can sleep and - * cause pagefault, which makes it pointless to use GFP_NOFS - * or GFP_ATOMIC. - */ - p = kmalloc_track_caller(len, GFP_KERNEL); - if (!p) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(p, src, len)) { - kfree(p); - return ERR_PTR(-EFAULT); - } - - return p; + return kmemdup(src, size_mul(element_size, count), gfp); } -EXPORT_SYMBOL(memdup_user); +EXPORT_SYMBOL(kmemdup_array); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) +/** + * kvmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + * + * Return: newly allocated copy of @src or %NULL in case of error, + * result may be not physically contiguous. Use kvfree() to free. + */ +void *kvmemdup(const void *src, size_t len, gfp_t gfp) { - void *ret; - size_t ks = 0; + void *p; + p = kvmalloc(len, gfp); if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; + memcpy(p, src, len); + return p; } +EXPORT_SYMBOL(kvmemdup); /** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * kmemdup_nul - Create a NUL-terminated string from unterminated data + * @s: The data to stringify + * @len: The size of the data + * @gfp: the GFP mask used in the kmalloc() call when allocating memory * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. + * Return: newly allocated copy of @s with NUL-termination or %NULL in + * case of error */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) +char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; + return s ? __kmemdup_nul(s, len, gfp) : NULL; +} +EXPORT_SYMBOL(kmemdup_nul); + +static kmem_buckets *user_buckets __ro_after_init; - return __do_krealloc(p, new_size, flags); +static int __init init_user_buckets(void) +{ + user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); + return 0; } -EXPORT_SYMBOL(__krealloc); +subsys_initcall(init_user_buckets); /** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. + * memdup_user - duplicate memory region from user space + * + * @src: source address in user space + * @len: number of bytes to copy * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * Return: an ERR_PTR() on failure. Result is physically + * contiguous, to be freed by kfree(). */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *memdup_user(const void __user *src, size_t len) { - void *ret; + void *p; - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } + p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); + if (!p) + return ERR_PTR(-ENOMEM); - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) + if (copy_from_user(p, src, len)) { kfree(p); + return ERR_PTR(-EFAULT); + } - return ret; + return p; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(memdup_user); /** - * kzfree - like kfree but zero memory - * @p: object to free memory of + * vmemdup_user - duplicate memory region from user space * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. + * @src: source address in user space + * @len: number of bytes to copy * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. + * Return: an ERR_PTR() on failure. Result may be not + * physically contiguous. Use kvfree() to free. */ -void kzfree(const void *p) +void *vmemdup_user(const void __user *src, size_t len) { - size_t ks; - void *mem = (void *)p; + void *p; - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); + p = kmem_buckets_valloc(user_buckets, len, GFP_USER); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kvfree(p); + return ERR_PTR(-EFAULT); + } + + return p; } -EXPORT_SYMBOL(kzfree); +EXPORT_SYMBOL(vmemdup_user); -/* +/** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. + * + * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { @@ -227,184 +288,1183 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) +/** + * memdup_user_nul - duplicate memory region from user space and NUL-terminate + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Return: an ERR_PTR() on failure. + */ +void *memdup_user_nul(const void __user *src, size_t len) { - struct vm_area_struct *next; + char *p; - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - mm->mmap = vma; - if (rb_parent) - next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - next = NULL; + p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); } - vma->vm_next = next; - if (next) - next->vm_prev = vma; + p[len] = '\0'; + + return p; } +EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_current(const struct vm_area_struct *vma) { + struct task_struct * __maybe_unused t = current; + return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the pid of the task that - * the vma is stack for. - */ -pid_t vm_is_stack(struct task_struct *task, - struct vm_area_struct *vma, int in_group) -{ - pid_t ret = 0; - - if (vm_is_stack_for_task(task, vma)) - return task->pid; - - if (in_group) { - struct task_struct *t; - rcu_read_lock(); - if (!pid_alive(task)) - goto done; - - t = task; - do { - if (vm_is_stack_for_task(t, vma)) { - ret = t->pid; - goto done; - } - } while_each_thread(task, t); -done: - rcu_read_unlock(); + * Change backing file, only valid to use during initial VMA setup. + */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + +#ifndef STACK_RND_MASK +#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ +#endif + +unsigned long randomize_stack_top(unsigned long stack_top) +{ + unsigned long random_variable = 0; + + if (current->flags & PF_RANDOMIZE) { + random_variable = get_random_long(); + random_variable &= STACK_RND_MASK; + random_variable <<= PAGE_SHIFT; } +#ifdef CONFIG_STACK_GROWSUP + return PAGE_ALIGN(stack_top) + random_variable; +#else + return PAGE_ALIGN(stack_top) - random_variable; +#endif +} - return ret; +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. + */ +unsigned long randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); } -#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +unsigned long __weak arch_randomize_brk(struct mm_struct *mm) +{ + /* Is the current task 32bit ? */ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); +} + +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + if (is_compat_task()) + rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); + else +#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ + rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + + return rnd << PAGE_SHIFT; +} + +static int mmap_is_legacy(const struct rlimit *rlim_stack) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + /* On parisc the stack always grows up - so a unlimited stack should + * not be an indicator to use the legacy memory layout. */ + if (rlim_stack->rlim_cur == RLIM_INFINITY && + !IS_ENABLED(CONFIG_STACK_GROWSUP)) + return 1; + + return sysctl_legacy_va_layout; +} + +/* + * Leave enough space between the mmap area and the stack to honour ulimit in + * the face of randomisation. + */ +#define MIN_GAP (SZ_128M) +#define MAX_GAP (STACK_TOP / 6 * 5) + +static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) +{ +#ifdef CONFIG_STACK_GROWSUP + /* + * For an upwards growing stack the calculation is much simpler. + * Memory for the maximum stack size is reserved at the top of the + * task. mmap_base starts directly below the stack and grows + * downwards. + */ + return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); +#else + unsigned long gap = rlim_stack->rlim_cur; + unsigned long pad = stack_guard_gap; + + /* Account for stack randomization if necessary */ + if (current->flags & PF_RANDOMIZE) + pad += (STACK_RND_MASK << PAGE_SHIFT); + + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + + if (gap < MIN_GAP && MIN_GAP < MAX_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(STACK_TOP - gap - rnd); +#endif +} + +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + if (mmap_is_legacy(rlim_stack)) { + mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; + mm_flags_clear(MMF_TOPDOWN, mm); + } else { + mm->mmap_base = mmap_base(random_factor, rlim_stack); + mm_flags_set(MMF_TOPDOWN, mm); + } +} +#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; + mm_flags_clear(MMF_TOPDOWN, mm); } #endif +#ifdef CONFIG_MMU +EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); +#endif -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * If the architecture not support this function, simply return with no - * page pinned +/** + * __account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * @task: task used to check RLIMIT_MEMLOCK + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped + * + * Assumes @task and @mm are valid (i.e. at least one reference on each), and + * that mmap_lock is held as writer. + * + * Return: + * * 0 on success + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + const struct task_struct *task, bool bypass_rlim) { - return 0; + unsigned long locked_vm, limit; + int ret = 0; + + mmap_assert_write_locked(mm); + + locked_vm = mm->locked_vm; + if (inc) { + if (!bypass_rlim) { + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked_vm + pages > limit) + ret = -ENOMEM; + } + if (!ret) + mm->locked_vm = locked_vm + pages; + } else { + WARN_ON_ONCE(pages > locked_vm); + mm->locked_vm = locked_vm - pages; + } + + pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, + (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + return ret; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(__account_locked_vm); /** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - * - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm, with force=0 and vma=NULL. However - * unlike get_user_pages, it must be called without mmap_sem held. - * - * get_user_pages_fast may take mmap_sem and page table locks, so no - * assumptions can be made about lack of locking. get_user_pages_fast is to be - * implemented in a way that is advantageous (vs get_user_pages()) when the - * user memory area is already faulted in and present in ptes. However if the - * pages have to be faulted in, it may turn out to be slightly slower so - * callers need to carefully consider what to use. On many architectures, - * get_user_pages_fast simply falls back to get_user_pages. - */ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + * account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against, may be NULL + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * + * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). + * + * Return: + * * 0 on success, or if mm is NULL + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. + */ +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { - struct mm_struct *mm = current->mm; int ret; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, nr_pages, - write, 0, pages, NULL); - up_read(&mm->mmap_sem); + if (pages == 0 || !mm) + return 0; + + mmap_write_lock(mm); + ret = __account_locked_vm(mm, pages, inc, current, + capable(CAP_IPC_LOCK)); + mmap_write_unlock(mm); return ret; } -EXPORT_SYMBOL_GPL(get_user_pages_fast); +EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { + loff_t off = (loff_t)pgoff << PAGE_SHIFT; unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, off, len); if (!ret) { - down_write(&mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); - up_write(&mm->mmap_sem); + if (mmap_write_lock_killable(mm)) + return -EINTR; + ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, + &uf); + mmap_write_unlock(mm); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } +/* + * Perform a userland memory mapping into the current process address space. See + * the comment for do_mmap() for more details on this operation in general. + * + * This differs from do_mmap() in that: + * + * a. An offset parameter is provided rather than pgoff, which is both checked + * for overflow and page alignment. + * b. mmap locking is performed on the caller's behalf. + * c. Userfaultfd unmap events and memory population are handled. + * + * This means that this function performs essentially the same work as if + * userland were invoking mmap (2). + * + * Returns either an error, or the address at which the requested mapping has + * been performed. + */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) + if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); -struct address_space *page_mapping(struct page *page) +/** + * __vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { - struct address_space *mapping = page->mapping; + size_t bytes; - VM_BUG_ON(PageSlab(page)); -#ifdef CONFIG_SWAP - if (unlikely(PageSwapCache(page))) { - swp_entry_t entry; + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + return __vmalloc_noprof(bytes, flags); +} +EXPORT_SYMBOL(__vmalloc_array_noprof); + +/** + * vmalloc_array - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vmalloc_array_noprof(size_t n, size_t size) +{ + return __vmalloc_array_noprof(n, size, GFP_KERNEL); +} +EXPORT_SYMBOL(vmalloc_array_noprof); + +/** + * __vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) +{ + return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); +} +EXPORT_SYMBOL(__vcalloc_noprof); + +/** + * vcalloc - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +void *vcalloc_noprof(size_t n, size_t size) +{ + return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vcalloc_noprof); + +struct anon_vma *folio_anon_vma(const struct folio *folio) +{ + unsigned long mapping = (unsigned long)folio->mapping; + + if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) + return NULL; + return (void *)(mapping - FOLIO_MAPPING_ANON); +} + +/** + * folio_mapping - Find the mapping where this folio is stored. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. Folios in the swap cache return the swap mapping + * this page is stored in (which is different from the mapping for the + * swap file or swap device where the data is stored). + * + * You can call this for folios which aren't in the swap cache or page + * cache and it will return NULL. + */ +struct address_space *folio_mapping(const struct folio *folio) +{ + struct address_space *mapping; + + /* This happens if someone calls flush_dcache_page on slab page */ + if (unlikely(folio_test_slab(folio))) + return NULL; + + if (unlikely(folio_test_swapcache(folio))) + return swap_address_space(folio->swap); + + mapping = folio->mapping; + if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) + return NULL; - entry.val = page_private(page); - mapping = swap_address_space(entry); - } else -#endif - if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; return mapping; } +EXPORT_SYMBOL(folio_mapping); -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +/** + * folio_copy - Copy the contents of one folio to another. + * @dst: Folio to copy to. + * @src: Folio to copy from. + * + * The bytes in the folio represented by @src are copied to @dst. + * Assumes the caller has validated that @dst is at least as large as @src. + * Can be called in atomic context for order-0 folios, but if the folio is + * larger, it may sleep. + */ +void folio_copy(struct folio *dst, struct folio *src) +{ + long i = 0; + long nr = folio_nr_pages(src); + + for (;;) { + copy_highpage(folio_page(dst, i), folio_page(src, i)); + if (++i == nr) + break; + cond_resched(); + } +} +EXPORT_SYMBOL(folio_copy); + +int folio_mc_copy(struct folio *dst, struct folio *src) +{ + long nr = folio_nr_pages(src); + long i = 0; + + for (;;) { + if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i))) + return -EHWPOISON; + if (++i == nr) + break; + cond_resched(); + } + + return 0; +} +EXPORT_SYMBOL(folio_mc_copy); + +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; +static int sysctl_overcommit_ratio __read_mostly = 50; +static unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ + +#ifdef CONFIG_SYSCTL + +static int overcommit_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_kbytes = 0; + return ret; +} + +static void sync_overcommit_as(struct work_struct *dummy) +{ + percpu_counter_sync(&vm_committed_as); +} + +static int overcommit_policy_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int new_policy = -1; + int ret; + + /* + * The deviation of sync_overcommit_as could be big with loose policy + * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to + * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply + * with the strict "NEVER", and to avoid possible race condition (even + * though user usually won't too frequently do the switching to policy + * OVERCOMMIT_NEVER), the switch is done in the following order: + * 1. changing the batch + * 2. sync percpu count on each CPU + * 3. switch the policy + */ + if (write) { + t = *table; + t.data = &new_policy; + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret || new_policy == -1) + return ret; + + mm_compute_batch(new_policy); + if (new_policy == OVERCOMMIT_NEVER) + schedule_on_each_cpu(sync_overcommit_as); + sysctl_overcommit_memory = new_policy; + } else { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } + + return ret; +} + +static int overcommit_kbytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + sysctl_overcommit_ratio = 0; + return ret; +} + +static const struct ctl_table util_sysctl_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = overcommit_policy_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +}; + +static int __init init_vm_util_sysctls(void) +{ + register_sysctl_init("vm", util_sysctl_table); + return 0; +} +subsys_initcall(init_vm_util_sysctls); +#endif /* CONFIG_SYSCTL */ + +/* + * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used + */ +unsigned long vm_commit_limit(void) +{ + unsigned long allowed; + + if (sysctl_overcommit_kbytes) + allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); + else + allowed = ((totalram_pages() - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100); + allowed += total_swap_pages; + + return allowed; +} + +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + * + * The time cost of this is very low for small platforms, and for big + * platform like a 2S/36C/72T Skylake server, in worst case where + * vm_committed_as's spinlock is under severe contention, the time cost + * could be about 30~40 microseconds. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_sum_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long allowed; + unsigned long bytes_failed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + if (pages > totalram_pages() + total_swap_pages) + goto error; + return 0; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + bytes_failed = pages << PAGE_SHIFT; + pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", + __func__, current->pid, current->comm, bytes_failed); + vm_unacct_memory(pages); + + return -ENOMEM; +} + +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * + * Return: the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! No looking before we're done */ + + spin_lock(&mm->arg_lock); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + spin_unlock(&mm->arg_lock); + + len = arg_end - arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = env_end - env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, env_start, + buffer+res, len, + FOLL_FORCE); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + +int __weak memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_local_page(page1); + addr2 = kmap_local_page(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_local(addr2); + kunmap_local(addr1); + return ret; +} + +#ifdef CONFIG_PRINTK +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * and last free path of that object. + */ +void mem_dump_obj(void *object) +{ + const char *type; + + if (kmem_dump_obj(object)) + return; + + if (vmalloc_dump_obj(object)) + return; + + if (is_vmalloc_addr(object)) + type = "vmalloc memory"; + else if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); +} +EXPORT_SYMBOL_GPL(mem_dump_obj); +#endif + +/* + * A driver might set a page logically offline -- PageOffline() -- and + * turn the page inaccessible in the hypervisor; after that, access to page + * content can be fatal. + * + * Some special PFN walkers -- i.e., /proc/kcore -- read content of random + * pages after checking PageOffline(); however, these PFN walkers can race + * with drivers that set PageOffline(). + * + * page_offline_freeze()/page_offline_thaw() allows for a subsystem to + * synchronize with such drivers, achieving that a page cannot be set + * PageOffline() while frozen. + * + * page_offline_begin()/page_offline_end() is used by drivers that care about + * such races when setting a page PageOffline(). + */ +static DECLARE_RWSEM(page_offline_rwsem); + +void page_offline_freeze(void) +{ + down_read(&page_offline_rwsem); +} + +void page_offline_thaw(void) +{ + up_read(&page_offline_rwsem); +} + +void page_offline_begin(void) +{ + down_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_begin); + +void page_offline_end(void) +{ + up_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_end); + +#ifndef flush_dcache_folio +void flush_dcache_folio(struct folio *folio) +{ + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + flush_dcache_page(folio_page(folio, i)); +} +EXPORT_SYMBOL(flush_dcache_folio); +#endif + +/** + * __compat_vma_mmap() - See description for compat_vma_mmap() + * for details. This is the same operation, only with a specific file operations + * struct which may or may not be the same as vma->vm_file->f_op. + * @f_op: The file operations whose .mmap_prepare() hook is specified. + * @file: The file which backs or will back the mapping. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * Returns: 0 on success or error. + */ +int __compat_vma_mmap(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); +} +EXPORT_SYMBOL(__compat_vma_mmap); + +/** + * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an + * existing VMA and execute any requested actions. + * @file: The file which possesss an f_op->mmap_prepare() hook. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * + * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain + * stacked filesystems invoke a nested mmap hook of an underlying file. + * + * Until all filesystems are converted to use .mmap_prepare(), we must be + * conservative and continue to invoke these stacked filesystems using the + * deprecated .mmap() hook. + * + * However we have a problem if the underlying file system possesses an + * .mmap_prepare() hook, as we are in a different context when we invoke the + * .mmap() hook, already having a VMA to deal with. + * + * compat_vma_mmap() is a compatibility function that takes VMA state, + * establishes a struct vm_area_desc descriptor, passes to the underlying + * .mmap_prepare() hook and applies any changes performed by it. + * + * Once the conversion of filesystems is complete this function will no longer + * be required and will be removed. + * + * Returns: 0 on success or error. + */ +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) +{ + return __compat_vma_mmap(file->f_op, file, vma); +} +EXPORT_SYMBOL(compat_vma_mmap); + +static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, + const struct page *page) +{ + /* + * Only the first page of a high-order buddy page has PageBuddy() set. + * So we have to check manually whether this page is part of a high- + * order buddy page. + */ + if (PageBuddy(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + ps->flags |= PAGE_SNAPSHOT_PG_BUDDY; + + if (folio_test_idle(folio)) + ps->flags |= PAGE_SNAPSHOT_PG_IDLE; +} + +/** + * snapshot_page() - Create a snapshot of a struct page + * @ps: Pointer to a struct page_snapshot to store the page snapshot + * @page: The page to snapshot + * + * Create a snapshot of the page and store both its struct page and struct + * folio representations in @ps. + * + * A snapshot is marked as "faithful" if the compound state of @page was + * stable and allowed safe reconstruction of the folio representation. In + * rare cases where this is not possible (e.g. due to folio splitting), + * snapshot_page() falls back to treating @page as a single page and the + * snapshot is marked as "unfaithful". The snapshot_page_is_faithful() + * helper can be used to check for this condition. + */ +void snapshot_page(struct page_snapshot *ps, const struct page *page) +{ + unsigned long head, nr_pages = 1; + struct folio *foliop; + int loops = 5; + + ps->pfn = page_to_pfn(page); + ps->flags = PAGE_SNAPSHOT_FAITHFUL; + +again: + memset(&ps->folio_snapshot, 0, sizeof(struct folio)); + memcpy(&ps->page_snapshot, page, sizeof(*page)); + head = ps->page_snapshot.compound_head; + if ((head & 1) == 0) { + ps->idx = 0; + foliop = (struct folio *)&ps->page_snapshot; + if (!folio_test_large(foliop)) { + set_ps_flags(ps, page_folio(page), page); + memcpy(&ps->folio_snapshot, foliop, + sizeof(struct page)); + return; + } + foliop = (struct folio *)page; + } else { + foliop = (struct folio *)(head - 1); + ps->idx = folio_page_idx(foliop, page); + } + + if (ps->idx < MAX_FOLIO_NR_PAGES) { + memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page)); + nr_pages = folio_nr_pages(&ps->folio_snapshot); + if (nr_pages > 1) + memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2, + sizeof(struct page)); + set_ps_flags(ps, foliop, page); + } + + if (ps->idx > nr_pages) { + if (loops-- > 0) + goto again; + clear_compound_head(&ps->page_snapshot); + foliop = (struct folio *)&ps->page_snapshot; + memcpy(&ps->folio_snapshot, foliop, sizeof(struct page)); + ps->flags = 0; + ps->idx = 0; + } +} + +static int mmap_action_finish(struct mmap_action *action, + const struct vm_area_struct *vma, int err) +{ + /* + * If an error occurs, unmap the VMA altogether and return an error. We + * only clear the newly allocated VMA, since this function is only + * invoked if we do NOT merge, so we only clean up the VMA we created. + */ + if (err) { + const size_t len = vma_pages(vma) << PAGE_SHIFT; + + do_munmap(current->mm, vma->vm_start, len, NULL); + + if (action->error_hook) { + /* We may want to filter the error. */ + err = action->error_hook(err); + + /* The caller should not clear the error. */ + VM_WARN_ON_ONCE(!err); + } + return err; + } + + if (action->success_hook) + return action->success_hook(vma); + + return 0; +} + +#ifdef CONFIG_MMU +/** + * mmap_action_prepare - Perform preparatory setup for an VMA descriptor + * action which need to be performed. + * @desc: The VMA descriptor to prepare for @action. + * @action: The action to perform. + */ +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + remap_pfn_range_prepare(desc, action->remap.start_pfn); + break; + case MMAP_IO_REMAP_PFN: + io_remap_pfn_range_prepare(desc, action->remap.start_pfn, + action->remap.size); + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +/** + * mmap_action_complete - Execute VMA descriptor action. + * @action: The action to perform. + * @vma: The VMA to perform the action upon. + * + * Similar to mmap_action_prepare(). + * + * Return: 0 on success, or error, at which point the VMA will be unmapped. + */ +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + err = remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + case MMAP_IO_REMAP_PFN: + err = io_remap_pfn_range_complete(vma, action->remap.start, + action->remap.start_pfn, action->remap.size, + action->remap.pgprot); + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#else +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle these. */ + break; + } +} +EXPORT_SYMBOL(mmap_action_prepare); + +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + int err = 0; + + switch (action->type) { + case MMAP_NOTHING: + break; + case MMAP_REMAP_PFN: + case MMAP_IO_REMAP_PFN: + WARN_ON_ONCE(1); /* nommu cannot handle this. */ + + err = -EINVAL; + break; + } + + return mmap_action_finish(action, vma, err); +} +EXPORT_SYMBOL(mmap_action_complete); +#endif + +#ifdef CONFIG_MMU +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * + * This is a simplified variant of folio_pte_batch_flags(). + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio in a single VMA and a single page table. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirt-bit and soft-dirty bit. + * + * ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Return: the number of table entries in the batch. + */ +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr) +{ + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); +} +#endif /* CONFIG_MMU */ + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. + */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +EXPORT_SYMBOL(page_range_contiguous); +#endif |
