diff options
Diffstat (limited to 'mm/userfaultfd.c')
| -rw-r--r-- | mm/userfaultfd.c | 2120 |
1 files changed, 2120 insertions, 0 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..e6dfd5f28acd --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,2120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + */ + +#include <linux/mm.h> +#include <linux/sched/signal.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/leafops.h> +#include <linux/userfaultfd_k.h> +#include <linux/mmu_notifier.h> +#include <linux/hugetlb.h> +#include <linux/shmem_fs.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include "internal.h" +#include "swap.h" + +static __always_inline +bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) +{ + /* Make sure that the dst range is fully within dst_vma. */ + if (dst_end > dst_vma->vm_end) + return false; + + /* + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + return false; + + return true; +} + +static __always_inline +struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = vma_lookup(mm, addr); + if (!vma) + vma = ERR_PTR(-ENOENT); + else if (!(vma->vm_flags & VM_SHARED) && + unlikely(anon_vma_prepare(vma))) + vma = ERR_PTR(-ENOMEM); + + return vma; +} + +#ifdef CONFIG_PER_VMA_LOCK +/* + * uffd_lock_vma() - Lookup and lock vma corresponding to @address. + * @mm: mm to search vma in. + * @address: address that the vma should contain. + * + * Should be called without holding mmap_lock. + * + * Return: A locked vma containing @address, -ENOENT if no vma is found, or + * -ENOMEM if anon_vma couldn't be allocated. + */ +static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, + unsigned long address) +{ + struct vm_area_struct *vma; + + vma = lock_vma_under_rcu(mm, address); + if (vma) { + /* + * We know we're going to need to use anon_vma, so check + * that early. + */ + if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) + vma_end_read(vma); + else + return vma; + } + + mmap_read_lock(mm); + vma = find_vma_and_prepare_anon(mm, address); + if (!IS_ERR(vma)) { + bool locked = vma_start_read_locked(vma); + + if (!locked) + vma = ERR_PTR(-EAGAIN); + } + + mmap_read_unlock(mm); + return vma; +} + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + dst_vma = uffd_lock_vma(dst_mm, dst_start); + if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + vma_end_read(dst_vma); + return ERR_PTR(-ENOENT); +} + +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} + +#else + +static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + struct vm_area_struct *dst_vma; + + mmap_read_lock(dst_mm); + dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); + if (IS_ERR(dst_vma)) + goto out_unlock; + + if (validate_dst_vma(dst_vma, dst_start + len)) + return dst_vma; + + dst_vma = ERR_PTR(-ENOENT); +out_unlock: + mmap_read_unlock(dst_mm); + return dst_vma; +} + +static void uffd_mfill_unlock(struct vm_area_struct *vma) +{ + mmap_read_unlock(vma->vm_mm); +} +#endif + +/* Check if dst_addr is outside of file's size. Must be called with ptl held. */ +static bool mfill_file_over_size(struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct inode *inode; + pgoff_t offset, max_off; + + if (!dst_vma->vm_file) + return false; + + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + return offset >= max_off; +} + +/* + * Install PTEs, to map dst_addr (within dst_vma) to page. + * + * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem + * and anon, and for both shared and private VMAs. + */ +int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + bool newly_allocated, uffd_flags_t flags) +{ + int ret; + struct mm_struct *dst_mm = dst_vma->vm_mm; + pte_t _dst_pte, *dst_pte; + bool writable = dst_vma->vm_flags & VM_WRITE; + bool vm_shared = dst_vma->vm_flags & VM_SHARED; + spinlock_t *ptl; + struct folio *folio = page_folio(page); + bool page_in_cache = folio_mapping(folio); + pte_t dst_ptep; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + _dst_pte = pte_mkdirty(_dst_pte); + if (page_in_cache && !vm_shared) + writable = false; + if (writable) + _dst_pte = pte_mkwrite(_dst_pte, dst_vma); + if (flags & MFILL_ATOMIC_WP) + _dst_pte = pte_mkuffd_wp(_dst_pte); + + ret = -EAGAIN; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; + + if (mfill_file_over_size(dst_vma, dst_addr)) { + ret = -EFAULT; + goto out_unlock; + } + + ret = -EEXIST; + + dst_ptep = ptep_get(dst_pte); + + /* + * We are allowed to overwrite a UFFD pte marker: consider when both + * MISSING|WP registered, we firstly wr-protect a none pte which has no + * page cache page backing it, then access the page. + */ + if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep)) + goto out_unlock; + + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + folio_add_lru(folio); + folio_add_file_rmap_pte(folio, page, dst_vma); + } else { + folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, dst_vma); + } + + /* + * Must happen after rmap, as mm_counter() checks mapping (via + * PageAnon()), which is set by __page_set_anon_rmap(). + */ + inc_mm_counter(dst_mm, mm_counter(folio)); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); +out: + return ret; +} + +static int mfill_atomic_pte_copy(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + uffd_flags_t flags, + struct folio **foliop) +{ + void *kaddr; + int ret; + struct folio *folio; + + if (!*foliop) { + ret = -ENOMEM; + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, + dst_addr); + if (!folio) + goto out; + + kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret = copy_from_user(kaddr, (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(kaddr); + + /* fallback to copy_from_user outside mmap_lock */ + if (unlikely(ret)) { + ret = -ENOENT; + *foliop = folio; + /* don't free the page */ + goto out; + } + + flush_dcache_folio(folio); + } else { + folio = *foliop; + *foliop = NULL; + } + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __folio_mark_uptodate(folio); + + ret = -ENOMEM; + if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) + goto out_release; + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, flags); + if (ret) + goto out_release; +out: + return ret; +out_release: + folio_put(folio); + goto out; +} + +static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct folio *folio; + int ret = -ENOMEM; + + folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); + if (!folio) + return ret; + + if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) + goto out_put; + + /* + * The memory barrier inside __folio_mark_uptodate makes sure that + * zeroing out the folio become visible before mapping the page + * using set_pte_at(). See do_anonymous_page(). + */ + __folio_mark_uptodate(folio); + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + &folio->page, true, 0); + if (ret) + goto out_put; + + return 0; +out_put: + folio_put(folio); + return ret; +} + +static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + if (mm_forbids_zeropage(dst_vma->vm_mm)) + return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EAGAIN; + dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; + if (mfill_file_over_size(dst_vma, dst_addr)) { + ret = -EFAULT; + goto out_unlock; + } + ret = -EEXIST; + if (!pte_none(ptep_get(dst_pte))) + goto out_unlock; + set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); +out: + return ret; +} + +/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ +static int mfill_atomic_pte_continue(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; + struct page *page; + int ret; + + ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find folio */ + if (ret == -ENOENT) + ret = -EFAULT; + if (ret) + goto out; + if (!folio) { + ret = -EFAULT; + goto out; + } + + page = folio_file_page(folio, pgoff); + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + page, false, flags); + if (ret) + goto out_release; + + folio_unlock(folio); + ret = 0; +out: + return ret; +out_release: + folio_unlock(folio); + folio_put(folio); + goto out; +} + +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ +static int mfill_atomic_pte_poison(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) +{ + int ret; + struct mm_struct *dst_mm = dst_vma->vm_mm; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + + _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + ret = -EAGAIN; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; + + if (mfill_file_over_size(dst_vma, dst_addr)) { + ret = -EFAULT; + goto out_unlock; + } + + ret = -EEXIST; + /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ + if (!pte_none(ptep_get(dst_pte))) + goto out_unlock; + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); +out: + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, address); + p4d = p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + return pmd_alloc(mm, pud, address); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * mfill_atomic processing for HUGETLB vmas. Note that this routine is + * called with either vma-lock or mmap_lock held, it will release the lock + * before returning. + */ +static __always_inline ssize_t mfill_atomic_hugetlb( + struct userfaultfd_ctx *ctx, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + uffd_flags_t flags) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + ssize_t err; + pte_t *dst_pte; + unsigned long src_addr, dst_addr; + long copied; + struct folio *folio; + unsigned long vma_hpagesize; + pgoff_t idx; + u32 hash; + struct address_space *mapping; + + /* + * There is no default zero huge page for all huge page sizes as + * supported by hugetlb. A PMD_SIZE huge pages may exist as used + * by THP. Since we can not reliably insert a zero page, this + * feature is not supported. + */ + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); + return -EINVAL; + } + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + folio = NULL; + vma_hpagesize = vma_kernel_pagesize(dst_vma); + + /* + * Validate alignment based on huge page size + */ + err = -EINVAL; + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + goto out_unlock; + +retry: + /* + * On routine entry dst_vma is set. If we had to drop mmap_lock and + * retry, dst_vma will be set to NULL and we must lookup again. + */ + if (!dst_vma) { + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + + err = -ENOENT; + if (!is_vm_hugetlb_page(dst_vma)) + goto out_unlock_vma; + + err = -EINVAL; + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock_vma; + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + } + + while (src_addr < src_start + len) { + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); + + /* + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. + */ + idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); + + err = -ENOMEM; + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); + if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { + const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte); + + if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) { + err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + } + + err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, + src_addr, flags, &folio); + + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + cond_resched(); + + if (unlikely(err == -ENOENT)) { + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); + VM_WARN_ON_ONCE(!folio); + + err = copy_folio_from_user(folio, + (const void __user *)src_addr, true); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + + dst_vma = NULL; + goto retry; + } else + VM_WARN_ON_ONCE(folio); + + if (!err) { + dst_addr += vma_hpagesize; + src_addr += vma_hpagesize; + copied += vma_hpagesize; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&ctx->map_changing_lock); +out_unlock_vma: + uffd_mfill_unlock(dst_vma); +out: + if (folio) + folio_put(folio); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); + return copied ? copied : err; +} +#else /* !CONFIG_HUGETLB_PAGE */ +/* fail at build time if gcc attempts to use this */ +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + uffd_flags_t flags); +#endif /* CONFIG_HUGETLB_PAGE */ + +static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + uffd_flags_t flags, + struct folio **foliop) +{ + ssize_t err; + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { + return mfill_atomic_pte_continue(dst_pmd, dst_vma, + dst_addr, flags); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + return mfill_atomic_pte_poison(dst_pmd, dst_vma, + dst_addr, flags); + } + + /* + * The normal page fault path for a shmem will invoke the + * fault, fill the hole in the file and COW it right away. The + * result generates plain anonymous memory. So when we are + * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll + * generate anonymous memory directly without actually filling + * the hole. For the MAP_PRIVATE case the robustness check + * only happens in the pagetable (to verify it's still none) + * and not in the radix tree. + */ + if (!(dst_vma->vm_flags & VM_SHARED)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + err = mfill_atomic_pte_copy(dst_pmd, dst_vma, + dst_addr, src_addr, + flags, foliop); + else + err = mfill_atomic_pte_zeropage(dst_pmd, + dst_vma, dst_addr); + } else { + err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, + dst_addr, src_addr, + flags, foliop); + } + + return err; +} + +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + uffd_flags_t flags) +{ + struct mm_struct *dst_mm = ctx->mm; + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied; + struct folio *folio; + + /* + * Sanitize the command parameters: + */ + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + VM_WARN_ON_ONCE(src_start + len <= src_start); + VM_WARN_ON_ONCE(dst_start + len <= dst_start); + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + folio = NULL; +retry: + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); + if (IS_ERR(dst_vma)) { + err = PTR_ERR(dst_vma); + goto out; + } + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err = -EINVAL; + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + /* + * If this is a HUGETLB vma, pass off to appropriate routine + */ + if (is_vm_hugetlb_page(dst_vma)) + return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + src_start, len, flags); + + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) + goto out_unlock; + if (!vma_is_shmem(dst_vma) && + uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + goto out_unlock; + + while (src_addr < src_start + len) { + pmd_t dst_pmdval; + + VM_WARN_ON_ONCE(dst_addr >= dst_start + len); + + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmdp_get_lockless(dst_pmd); + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) { + err = -ENOMEM; + break; + } + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || + pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_bad(dst_pmdval))) { + err = -EFAULT; + break; + } + /* + * For shmem mappings, khugepaged is allowed to remove page + * tables under us; pte_offset_map_lock() will deal with that. + */ + + err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, + src_addr, flags, &folio); + cond_resched(); + + if (unlikely(err == -ENOENT)) { + void *kaddr; + + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); + VM_WARN_ON_ONCE(!folio); + + kaddr = kmap_local_folio(folio, 0); + err = copy_from_user(kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_local(kaddr); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + flush_dcache_folio(folio); + goto retry; + } else + VM_WARN_ON_ONCE(folio); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&ctx->map_changing_lock); + uffd_mfill_unlock(dst_vma); +out: + if (folio) + folio_put(folio); + VM_WARN_ON_ONCE(copied < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!copied && !err); + return copied ? copied : err; +} + +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, + uffd_flags_t flags) +{ + return mfill_atomic(ctx, dst_start, src_start, len, + uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); +} + +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, + unsigned long start, + unsigned long len) +{ + return mfill_atomic(ctx, start, 0, len, + uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); +} + +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) +{ + + /* + * A caller might reasonably assume that UFFDIO_CONTINUE contains an + * smp_wmb() to ensure that any writes to the about-to-be-mapped page by + * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to + * subsequent loads from the page through the newly mapped address range. + */ + smp_wmb(); + + return mfill_atomic(ctx, start, 0, len, + uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); +} + +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, uffd_flags_t flags) +{ + return mfill_atomic(ctx, start, 0, len, + uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); +} + +long uffd_wp_range(struct vm_area_struct *dst_vma, + unsigned long start, unsigned long len, bool enable_wp) +{ + unsigned int mm_cp_flags; + struct mmu_gather tlb; + long ret; + + VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, + "The address range exceeds VMA boundary.\n"); + if (enable_wp) + mm_cp_flags = MM_CP_UFFD_WP; + else + mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; + + /* + * vma->vm_page_prot already reflects that uffd-wp is enabled for this + * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed + * to be write-protected as default whenever protection changes. + * Try upgrading write permissions manually. + */ + if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; + tlb_gather_mmu(&tlb, dst_vma->vm_mm); + ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); + tlb_finish_mmu(&tlb); + + return ret; +} + +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp) +{ + struct mm_struct *dst_mm = ctx->mm; + unsigned long end = start + len; + unsigned long _start, _end; + struct vm_area_struct *dst_vma; + unsigned long page_mask; + long err; + VMA_ITERATOR(vmi, dst_mm, start); + + /* + * Sanitize the command parameters: + */ + VM_WARN_ON_ONCE(start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + VM_WARN_ON_ONCE(start + len <= start); + + mmap_read_lock(dst_mm); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err = -ENOENT; + for_each_vma_range(vmi, dst_vma, end) { + + if (!userfaultfd_wp(dst_vma)) { + err = -ENOENT; + break; + } + + if (is_vm_hugetlb_page(dst_vma)) { + err = -EINVAL; + page_mask = vma_kernel_pagesize(dst_vma) - 1; + if ((start & page_mask) || (len & page_mask)) + break; + } + + _start = max(dst_vma->vm_start, start); + _end = min(dst_vma->vm_end, end); + + err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); + + /* Return 0 on success, <0 on failures */ + if (err < 0) + break; + err = 0; + } +out_unlock: + up_read(&ctx->map_changing_lock); + mmap_read_unlock(dst_mm); + return err; +} + + +void double_pt_lock(spinlock_t *ptl1, + spinlock_t *ptl2) + __acquires(ptl1) + __acquires(ptl2) +{ + if (ptl1 > ptl2) + swap(ptl1, ptl2); + /* lock in virtual address order to avoid lock inversion */ + spin_lock(ptl1); + if (ptl1 != ptl2) + spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); + else + __acquire(ptl2); +} + +void double_pt_unlock(spinlock_t *ptl1, + spinlock_t *ptl2) + __releases(ptl1) + __releases(ptl2) +{ + spin_unlock(ptl1); + if (ptl1 != ptl2) + spin_unlock(ptl2); + else + __release(ptl2); +} + +static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval) +{ + return pte_same(ptep_get(src_pte), orig_src_pte) && + pte_same(ptep_get(dst_pte), orig_dst_pte) && + pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); +} + +/* + * Checks if the two ptes and the corresponding folio are eligible for batched + * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. + * + * NOTE: folio's reference is not required as the whole operation is within + * PTL's critical section. + */ +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, + unsigned long src_addr, + pte_t *src_pte, pte_t *dst_pte) +{ + pte_t orig_dst_pte, orig_src_pte; + struct folio *folio; + + orig_dst_pte = ptep_get(dst_pte); + if (!pte_none(orig_dst_pte)) + return NULL; + + orig_src_pte = ptep_get(src_pte); + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) + return NULL; + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !folio_trylock(folio)) + return NULL; + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) { + folio_unlock(folio); + return NULL; + } + return folio; +} + +/* + * Moves src folios to dst in a batch as long as they are not large, and can + * successfully take the lock via folio_trylock(). + */ +static long move_present_ptes(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio **first_src_folio, unsigned long len) +{ + int err = 0; + struct folio *src_folio = *first_src_folio; + unsigned long src_start = src_addr; + unsigned long src_end; + + len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; + src_end = pmd_addr_end(src_addr, src_addr + len); + flush_cache_range(src_vma, src_addr, src_end); + double_pt_lock(dst_ptl, src_ptl); + + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { + err = -EAGAIN; + goto out; + } + if (folio_test_large(src_folio) || + folio_maybe_dma_pinned(src_folio) || + !PageAnonExclusive(&src_folio->page)) { + err = -EBUSY; + goto out; + } + /* It's safe to drop the reference now as the page-table is holding one. */ + folio_put(*first_src_folio); + *first_src_folio = NULL; + arch_enter_lazy_mmu_mode(); + + while (true) { + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pte_at(mm, src_addr, src_pte, orig_src_pte); + err = -EBUSY; + break; + } + + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ + if (pgtable_supports_soft_dirty()) + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + + src_addr += PAGE_SIZE; + if (src_addr == src_end) + break; + dst_addr += PAGE_SIZE; + dst_pte++; + src_pte++; + + folio_unlock(src_folio); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, + src_pte, dst_pte); + if (!src_folio) + break; + } + + arch_leave_lazy_mmu_mode(); + if (src_addr > src_start) + flush_tlb_range(src_vma, src_start, src_addr); + + if (src_folio) + folio_unlock(src_folio); +out: + double_pt_unlock(dst_ptl, src_ptl); + return src_addr > src_start ? src_addr - src_start : err; +} + +static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) +{ + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + + double_pt_lock(dst_ptl, src_ptl); + + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + /* + * The src_folio resides in the swapcache, requiring an update to its + * index and mapping to align with the dst_vma, where a swap-in may + * occur and hit the swapcache after moving the PTE. + */ + if (src_folio) { + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + } + + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + if (pgtable_supports_soft_dirty()) + orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); + set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return PAGE_SIZE; +} + +static int move_zeropage_pte(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl) +{ + pte_t zero_pte; + + double_pt_lock(dst_ptl, src_ptl); + if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval)) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } + + zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ptep_clear_flush(src_vma, src_addr, src_pte); + set_pte_at(mm, dst_addr, dst_pte, zero_pte); + double_pt_unlock(dst_ptl, src_ptl); + + return PAGE_SIZE; +} + + +/* + * The mmap_lock for reading is held by the caller. Just move the page(s) + * from src_pmd to dst_pmd if possible, and return number of bytes moved. + * On failure, an error code is returned. + */ +static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + unsigned long len, __u64 mode) +{ + struct swap_info_struct *si = NULL; + pte_t orig_src_pte, orig_dst_pte; + pte_t src_folio_pte; + spinlock_t *src_ptl, *dst_ptl; + pte_t *src_pte = NULL; + pte_t *dst_pte = NULL; + pmd_t dummy_pmdval; + pmd_t dst_pmdval; + struct folio *src_folio = NULL; + struct mmu_notifier_range range; + long ret = 0; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + src_addr, src_addr + len); + mmu_notifier_invalidate_range_start(&range); +retry: + /* + * Use the maywrite version to indicate that dst_pte will be modified, + * since dst_pte needs to be none, the subsequent pte_same() check + * cannot prevent the dst_pte page from being freed concurrently, so we + * also need to abtain dst_pmdval and recheck pmd_same() later. + */ + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, + &dst_ptl); + + /* Retry if a huge pmd materialized from under us */ + if (unlikely(!dst_pte)) { + ret = -EAGAIN; + goto out; + } + + /* + * Unlike dst_pte, the subsequent pte_same() check can ensure the + * stability of the src_pte page, so there is no need to get pmdval, + * just pass a dummy variable to it. + */ + src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, + &src_ptl); + + /* + * We held the mmap_lock for reading so MADV_DONTNEED + * can zap transparent huge pages under us, or the + * transparent huge page fault can establish new + * transparent huge pages under us. + */ + if (unlikely(!src_pte)) { + ret = -EAGAIN; + goto out; + } + + /* Sanity checks before the operation */ + if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || + pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { + ret = -EINVAL; + goto out; + } + + spin_lock(dst_ptl); + orig_dst_pte = ptep_get(dst_pte); + spin_unlock(dst_ptl); + if (!pte_none(orig_dst_pte)) { + ret = -EEXIST; + goto out; + } + + spin_lock(src_ptl); + orig_src_pte = ptep_get(src_pte); + spin_unlock(src_ptl); + if (pte_none(orig_src_pte)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) + ret = -ENOENT; + else /* nothing to do to move a hole */ + ret = PAGE_SIZE; + goto out; + } + + /* If PTE changed after we locked the folio them start over */ + if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { + ret = -EAGAIN; + goto out; + } + + if (pte_present(orig_src_pte)) { + if (is_zero_pfn(pte_pfn(orig_src_pte))) { + ret = move_zeropage_pte(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, + dst_pmd, dst_pmdval, dst_ptl, src_ptl); + goto out; + } + + /* + * Pin and lock source folio. Since we are in RCU read section, + * we can't block, so on contention have to unmap the ptes, + * obtain the lock and retry. + */ + if (!src_folio) { + struct folio *folio; + bool locked; + + /* + * Pin the page while holding the lock to be sure the + * page isn't freed under us + */ + spin_lock(src_ptl); + if (!pte_same(orig_src_pte, ptep_get(src_pte))) { + spin_unlock(src_ptl); + ret = -EAGAIN; + goto out; + } + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !PageAnonExclusive(&folio->page)) { + spin_unlock(src_ptl); + ret = -EBUSY; + goto out; + } + + locked = folio_trylock(folio); + /* + * We avoid waiting for folio lock with a raised + * refcount for large folios because extra refcounts + * will result in split_folio() failing later and + * retrying. If multiple tasks are trying to move a + * large folio we can end up livelocking. + */ + if (!locked && folio_test_large(folio)) { + spin_unlock(src_ptl); + ret = -EAGAIN; + goto out; + } + + folio_get(folio); + src_folio = folio; + src_folio_pte = orig_src_pte; + spin_unlock(src_ptl); + + if (!locked) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + + if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { + ret = -EBUSY; + goto out; + } + } + + /* at this point we have src_folio locked */ + if (folio_test_large(src_folio)) { + /* split_folio() can block */ + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + ret = split_folio(src_folio); + if (ret) + goto out; + /* have to reacquire the folio after it got split */ + folio_unlock(src_folio); + folio_put(src_folio); + src_folio = NULL; + goto retry; + } + + ret = move_present_ptes(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, &src_folio, + len); + } else { /* !pte_present() */ + struct folio *folio = NULL; + const softleaf_t entry = softleaf_from_pte(orig_src_pte); + + if (softleaf_is_migration(entry)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + migration_entry_wait(mm, src_pmd, src_addr); + + ret = -EAGAIN; + goto out; + } else if (!softleaf_is_swap(entry)) { + ret = -EFAULT; + goto out; + } + + if (!pte_swp_exclusive(orig_src_pte)) { + ret = -EBUSY; + goto out; + } + + si = get_swap_device(entry); + if (unlikely(!si)) { + ret = -EAGAIN; + goto out; + } + /* + * Verify the existence of the swapcache. If present, the folio's + * index and mapping must be updated even when the PTE is a swap + * entry. The anon_vma lock is not taken during this process since + * the folio has already been unmapped, and the swap entry is + * exclusive, preventing rmap walks. + * + * For large folios, return -EBUSY immediately, as split_folio() + * also returns -EBUSY when attempting to split unmapped large + * folios in the swapcache. This issue needs to be resolved + * separately to allow proper handling. + */ + if (!src_folio) + folio = swap_cache_get_folio(entry); + if (folio) { + if (folio_test_large(folio)) { + ret = -EBUSY; + folio_put(folio); + goto out; + } + src_folio = folio; + src_folio_pte = orig_src_pte; + if (!folio_trylock(src_folio)) { + pte_unmap(src_pte); + pte_unmap(dst_pte); + src_pte = dst_pte = NULL; + put_swap_device(si); + si = NULL; + /* now we can block and wait */ + folio_lock(src_folio); + goto retry; + } + } + ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, + dst_ptl, src_ptl, src_folio, si, entry); + } + +out: + if (src_folio) { + folio_unlock(src_folio); + folio_put(src_folio); + } + /* + * Unmap in reverse order (LIFO) to maintain proper kmap_local + * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte + * first, then src_pte, so we must unmap src_pte first, then dst_pte. + */ + if (src_pte) + pte_unmap(src_pte); + if (dst_pte) + pte_unmap(dst_pte); + mmu_notifier_invalidate_range_end(&range); + if (si) + put_swap_device(si); + + return ret; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline bool move_splits_huge_pmd(unsigned long dst_addr, + unsigned long src_addr, + unsigned long src_end) +{ + return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || + src_end - src_addr < HPAGE_PMD_SIZE; +} +#else +static inline bool move_splits_huge_pmd(unsigned long dst_addr, + unsigned long src_addr, + unsigned long src_end) +{ + /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ + return false; +} +#endif + +static inline bool vma_move_compatible(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | + VM_MIXEDMAP | VM_SHADOW_STACK)); +} + +static int validate_move_areas(struct userfaultfd_ctx *ctx, + struct vm_area_struct *src_vma, + struct vm_area_struct *dst_vma) +{ + /* Only allow moving if both have the same access and protection */ + if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || + pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) + return -EINVAL; + + /* Only allow moving if both are mlocked or both aren't */ + if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) + return -EINVAL; + + /* + * For now, we keep it simple and only move between writable VMAs. + * Access flags are equal, therefore checking only the source is enough. + */ + if (!(src_vma->vm_flags & VM_WRITE)) + return -EINVAL; + + /* Check if vma flags indicate content which can be moved */ + if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) + return -EINVAL; + + /* Ensure dst_vma is registered in uffd we are operating on */ + if (!dst_vma->vm_userfaultfd_ctx.ctx || + dst_vma->vm_userfaultfd_ctx.ctx != ctx) + return -EINVAL; + + /* Only allow moving across anonymous vmas */ + if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) + return -EINVAL; + + return 0; +} + +static __always_inline +int find_vmas_mm_locked(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + + mmap_assert_locked(mm); + vma = find_vma_and_prepare_anon(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* Skip finding src_vma if src_start is in dst_vma */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) + goto out_success; + + vma = vma_lookup(mm, src_start); + if (!vma) + return -ENOENT; +out_success: + *src_vmap = vma; + return 0; +} + +#ifdef CONFIG_PER_VMA_LOCK +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + struct vm_area_struct *vma; + int err; + + vma = uffd_lock_vma(mm, dst_start); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + *dst_vmap = vma; + /* + * Skip finding src_vma if src_start is in dst_vma. This also ensures + * that we don't lock the same vma twice. + */ + if (src_start >= vma->vm_start && src_start < vma->vm_end) { + *src_vmap = vma; + return 0; + } + + /* + * Using uffd_lock_vma() to get src_vma can lead to following deadlock: + * + * Thread1 Thread2 + * ------- ------- + * vma_start_read(dst_vma) + * mmap_write_lock(mm) + * vma_start_write(src_vma) + * vma_start_read(src_vma) + * mmap_read_lock(mm) + * vma_start_write(dst_vma) + */ + *src_vmap = lock_vma_under_rcu(mm, src_start); + if (likely(*src_vmap)) + return 0; + + /* Undo any locking and retry in mmap_lock critical section */ + vma_end_read(*dst_vmap); + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (err) + goto out; + + if (!vma_start_read_locked(*dst_vmap)) { + err = -EAGAIN; + goto out; + } + + /* Nothing further to do if both vmas are locked. */ + if (*dst_vmap == *src_vmap) + goto out; + + if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { + /* Undo dst_vmap locking if src_vmap failed to lock */ + vma_end_read(*dst_vmap); + err = -EAGAIN; + } +out: + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + vma_end_read(src_vma); + if (src_vma != dst_vma) + vma_end_read(dst_vma); +} + +#else + +static int uffd_move_lock(struct mm_struct *mm, + unsigned long dst_start, + unsigned long src_start, + struct vm_area_struct **dst_vmap, + struct vm_area_struct **src_vmap) +{ + int err; + + mmap_read_lock(mm); + err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); + if (err) + mmap_read_unlock(mm); + return err; +} + +static void uffd_move_unlock(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + mmap_assert_locked(src_vma->vm_mm); + mmap_read_unlock(dst_vma->vm_mm); +} +#endif + +/** + * move_pages - move arbitrary anonymous pages of an existing vma + * @ctx: pointer to the userfaultfd context + * @dst_start: start of the destination virtual memory range + * @src_start: start of the source virtual memory range + * @len: length of the virtual memory range + * @mode: flags from uffdio_move.mode + * + * It will either use the mmap_lock in read mode or per-vma locks + * + * move_pages() remaps arbitrary anonymous pages atomically in zero + * copy. It only works on non shared anonymous pages because those can + * be relocated without generating non linear anon_vmas in the rmap + * code. + * + * It provides a zero copy mechanism to handle userspace page faults. + * The source vma pages should have mapcount == 1, which can be + * enforced by using madvise(MADV_DONTFORK) on src vma. + * + * The thread receiving the page during the userland page fault + * will receive the faulting page in the source vma through the network, + * storage or any other I/O device (MADV_DONTFORK in the source vma + * avoids move_pages() to fail with -EBUSY if the process forks before + * move_pages() is called), then it will call move_pages() to map the + * page in the faulting address in the destination vma. + * + * This userfaultfd command works purely via pagetables, so it's the + * most efficient way to move physical non shared anonymous pages + * across different virtual addresses. Unlike mremap()/mmap()/munmap() + * it does not create any new vmas. The mapping in the destination + * address is atomic. + * + * It only works if the vma protection bits are identical from the + * source and destination vma. + * + * It can remap non shared anonymous pages within the same vma too. + * + * If the source virtual memory range has any unmapped holes, or if + * the destination virtual memory range is not a whole unmapped hole, + * move_pages() will fail respectively with -ENOENT or -EEXIST. This + * provides a very strict behavior to avoid any chance of memory + * corruption going unnoticed if there are userland race conditions. + * Only one thread should resolve the userland page fault at any given + * time for any given faulting address. This means that if two threads + * try to both call move_pages() on the same destination address at the + * same time, the second thread will get an explicit error from this + * command. + * + * The command retval will return "len" is successful. The command + * however can be interrupted by fatal signals or errors. If + * interrupted it will return the number of bytes successfully + * remapped before the interruption if any, or the negative error if + * none. It will never return zero. Either it will return an error or + * an amount of bytes successfully moved. If the retval reports a + * "short" remap, the move_pages() command should be repeated by + * userland with src+retval, dst+reval, len-retval if it wants to know + * about the error that interrupted it. + * + * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to + * prevent -ENOENT errors to materialize if there are holes in the + * source virtual range that is being remapped. The holes will be + * accounted as successfully remapped in the retval of the + * command. This is mostly useful to remap hugepage naturally aligned + * virtual regions without knowing if there are transparent hugepage + * in the regions or not, but preventing the risk of having to split + * the hugepmd during the remap. + */ +ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *src_vma, *dst_vma; + unsigned long src_addr, dst_addr, src_end; + pmd_t *src_pmd, *dst_pmd; + long err = -EINVAL; + ssize_t moved = 0; + + /* Sanitize the command parameters. */ + VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); + VM_WARN_ON_ONCE(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + VM_WARN_ON_ONCE(src_start + len < src_start); + VM_WARN_ON_ONCE(dst_start + len < dst_start); + + err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); + if (err) + goto out; + + /* Re-check after taking map_changing_lock */ + err = -EAGAIN; + down_read(&ctx->map_changing_lock); + if (likely(atomic_read(&ctx->mmap_changing))) + goto out_unlock; + /* + * Make sure the vma is not shared, that the src and dst remap + * ranges are both valid and fully within a single existing + * vma. + */ + err = -EINVAL; + if (src_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (src_start + len > src_vma->vm_end) + goto out_unlock; + + if (dst_vma->vm_flags & VM_SHARED) + goto out_unlock; + if (dst_start + len > dst_vma->vm_end) + goto out_unlock; + + err = validate_move_areas(ctx, src_vma, dst_vma); + if (err) + goto out_unlock; + + for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; + src_addr < src_end;) { + spinlock_t *ptl; + pmd_t dst_pmdval; + unsigned long step_size; + + /* + * Below works because anonymous area would not have a + * transparent huge PUD. If file-backed support is added, + * that case would need to be handled here. + */ + src_pmd = mm_find_pmd(mm, src_addr); + if (unlikely(!src_pmd)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { + err = -ENOENT; + break; + } + src_pmd = mm_alloc_pmd(mm, src_addr); + if (unlikely(!src_pmd)) { + err = -ENOMEM; + break; + } + } + dst_pmd = mm_alloc_pmd(mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't override it and just + * be strict. If dst_pmd changes into TPH after this check, the + * move_pages_huge_pmd() will detect the change and retry + * while move_pages_pte() will detect the change and fail. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + + ptl = pmd_trans_huge_lock(src_pmd, src_vma); + if (ptl) { + /* Check if we can move the pmd without splitting it. */ + if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || + !pmd_none(dst_pmdval)) { + /* Can be a migration entry */ + if (pmd_present(*src_pmd)) { + struct folio *folio = pmd_folio(*src_pmd); + + if (!is_huge_zero_folio(folio) && + !PageAnonExclusive(&folio->page)) { + spin_unlock(ptl); + err = -EBUSY; + break; + } + } + + spin_unlock(ptl); + split_huge_pmd(src_vma, src_pmd, src_addr); + /* The folio will be split by move_pages_pte() */ + continue; + } + + err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, + dst_pmdval, dst_vma, src_vma, + dst_addr, src_addr); + step_size = HPAGE_PMD_SIZE; + } else { + long ret; + + if (pmd_none(*src_pmd)) { + if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { + err = -ENOENT; + break; + } + if (unlikely(__pte_alloc(mm, src_pmd))) { + err = -ENOMEM; + break; + } + } + + if (unlikely(pte_alloc(mm, dst_pmd))) { + err = -ENOMEM; + break; + } + + ret = move_pages_ptes(mm, dst_pmd, src_pmd, + dst_vma, src_vma, dst_addr, + src_addr, src_end - src_addr, mode); + if (ret < 0) + err = ret; + else + step_size = ret; + } + + cond_resched(); + + if (fatal_signal_pending(current)) { + /* Do not override an error */ + if (!err || err == -EAGAIN) + err = -EINTR; + break; + } + + if (err) { + if (err == -EAGAIN) + continue; + break; + } + + /* Proceed to the next page */ + dst_addr += step_size; + src_addr += step_size; + moved += step_size; + } + +out_unlock: + up_read(&ctx->map_changing_lock); + uffd_move_unlock(dst_vma, src_vma); +out: + VM_WARN_ON_ONCE(moved < 0); + VM_WARN_ON_ONCE(err > 0); + VM_WARN_ON_ONCE(!moved && !err); + return moved ? moved : err; +} + +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; + + vm_flags_reset(vma, vm_flags); + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) + vma_set_page_prot(vma); +} + +static void userfaultfd_set_ctx(struct vm_area_struct *vma, + struct userfaultfd_ctx *ctx, + vm_flags_t vm_flags) +{ + vma_start_write(vma); + vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; + userfaultfd_set_vm_flags(vma, + (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); +} + +void userfaultfd_reset_ctx(struct vm_area_struct *vma) +{ + userfaultfd_set_ctx(vma, NULL, 0); +} + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + struct vm_area_struct *ret; + bool give_up_on_oom = false; + + /* + * If we are modifying only and not splitting, just give up on the merge + * if OOM prevents us from merging successfully. + */ + if (start == vma->vm_start && end == vma->vm_end) + give_up_on_oom = true; + + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(vma, start, end - start, false); + + ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, + vma->vm_flags & ~__VM_UFFD_FLAGS, + NULL_VM_UFFD_CTX, give_up_on_oom); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + if (!IS_ERR(ret)) + userfaultfd_reset_ctx(ret); + + return ret; +} + +/* Assumes mmap write lock taken, and mm_struct pinned. */ +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + vm_flags_t vm_flags, + unsigned long start, unsigned long end, + bool wp_async) +{ + VMA_ITERATOR(vmi, ctx->mm, start); + struct vm_area_struct *prev = vma_prev(&vmi); + unsigned long vma_end; + vm_flags_t new_flags; + + if (vma->vm_start < start) + prev = vma; + + for_each_vma_range(vmi, vma, end) { + cond_resched(); + + VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); + VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. + */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}, + /* give_up_on_oom = */false); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + userfaultfd_set_ctx(vma, ctx, vm_flags); + + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + +skip: + prev = vma; + start = vma->vm_end; + } + + return 0; +} + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + /* the various vma->vm_userfaultfd_ctx still points to it */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + if (vma->vm_userfaultfd_ctx.ctx == ctx) + userfaultfd_reset_ctx(vma); + } + mmap_write_unlock(mm); +} + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx) +{ + struct vm_area_struct *vma, *prev; + VMA_ITERATOR(vmi, mm, 0); + + if (!mmget_not_zero(mm)) + return; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_lock. So + * it's critical that released is set to true (above), before + * taking the mmap_lock for writing. + */ + mmap_write_lock(mm); + prev = NULL; + for_each_vma(vmi, vma) { + cond_resched(); + VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & __VM_UFFD_FLAGS)); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + + vma = userfaultfd_clear_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end); + prev = vma; + } + mmap_write_unlock(mm); + mmput(mm); +} |
