summaryrefslogtreecommitdiff
path: root/mm/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r--mm/userfaultfd.c1903
1 files changed, 1614 insertions, 289 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0499907b6f1a..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
@@ -18,25 +18,14 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
- /*
- * Make sure that the dst range is both valid and fully within a
- * single existing vma.
- */
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
- if (!dst_vma)
- return NULL;
-
- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
- return NULL;
+ /* Make sure that the dst range is fully within dst_vma. */
+ if (dst_end > dst_vma->vm_end)
+ return false;
/*
* Check the vma is registered in uffd, this is required to
@@ -44,84 +33,191 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ return false;
+
+ return true;
+}
+
+static __always_inline
+struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm,
+ unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ vma = ERR_PTR(-ENOENT);
+ else if (!(vma->vm_flags & VM_SHARED) &&
+ unlikely(anon_vma_prepare(vma)))
+ vma = ERR_PTR(-ENOMEM);
+
+ return vma;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+/*
+ * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
+ * @mm: mm to search vma in.
+ * @address: address that the vma should contain.
+ *
+ * Should be called without holding mmap_lock.
+ *
+ * Return: A locked vma containing @address, -ENOENT if no vma is found, or
+ * -ENOMEM if anon_vma couldn't be allocated.
+ */
+static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct vm_area_struct *vma;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (vma) {
+ /*
+ * We know we're going to need to use anon_vma, so check
+ * that early.
+ */
+ if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
+ vma_end_read(vma);
+ else
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = find_vma_and_prepare_anon(mm, address);
+ if (!IS_ERR(vma)) {
+ bool locked = vma_start_read_locked(vma);
+
+ if (!locked)
+ vma = ERR_PTR(-EAGAIN);
+ }
+
+ mmap_read_unlock(mm);
+ return vma;
+}
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+
+ dst_vma = uffd_lock_vma(dst_mm, dst_start);
+ if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ vma_end_read(dst_vma);
+ return ERR_PTR(-ENOENT);
+}
+
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ vma_end_read(vma);
+}
+
+#else
+
+static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len)
+{
+ struct vm_area_struct *dst_vma;
+ mmap_read_lock(dst_mm);
+ dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start);
+ if (IS_ERR(dst_vma))
+ goto out_unlock;
+
+ if (validate_dst_vma(dst_vma, dst_start + len))
+ return dst_vma;
+
+ dst_vma = ERR_PTR(-ENOENT);
+out_unlock:
+ mmap_read_unlock(dst_mm);
return dst_vma;
}
+static void uffd_mfill_unlock(struct vm_area_struct *vma)
+{
+ mmap_read_unlock(vma->vm_mm);
+}
+#endif
+
+/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
+static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ if (!dst_vma->vm_file)
+ return false;
+
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ return offset >= max_off;
+}
+
/*
* Install PTEs, to map dst_addr (within dst_vma) to page.
*
* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
* and anon, and for both shared and private VMAs.
*/
-int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy)
+ bool newly_allocated, uffd_flags_t flags)
{
int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
- struct folio *folio;
- struct inode *inode;
- pgoff_t offset, max_off;
+ struct folio *folio = page_folio(page);
+ bool page_in_cache = folio_mapping(folio);
+ pte_t dst_ptep;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
-
- /*
- * Always mark a PTE as write-protected when needed, regardless of
- * VM_WRITE, which the user might change.
- */
- if (wp_copy) {
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- writable = false;
- }
-
if (writable)
- _dst_pte = pte_mkwrite(_dst_pte);
- else
- /*
- * We need this to make sure write bit removed; as mk_pte()
- * could return a pte with write bit set.
- */
- _dst_pte = pte_wrprotect(_dst_pte);
+ _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
+ if (flags & MFILL_ATOMIC_WP)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ ret = -EAGAIN;
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
- if (vma_is_shmem(dst_vma)) {
- /* serialize against truncate with the page table lock */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
+
+ dst_ptep = ptep_get(dst_pte);
+
/*
- * We allow to overwrite a pte marker: consider when both MISSING|WP
- * registered, we firstly wr-protect a none pte which has no page cache
- * page backing it, then access the page.
+ * We are allowed to overwrite a UFFD pte marker: consider when both
+ * MISSING|WP registered, we firstly wr-protect a none pte which has no
+ * page cache page backing it, then access the page.
*/
- if (!pte_none_mostly(*dst_pte))
+ if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
goto out_unlock;
- folio = page_folio(page);
if (page_in_cache) {
/* Usually, cache pages are already added to LRU */
if (newly_allocated)
folio_add_lru(folio);
- page_add_file_rmap(page, dst_vma, false);
+ folio_add_file_rmap_pte(folio, page, dst_vma);
} else {
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
+ folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, dst_vma);
}
@@ -129,7 +225,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
* Must happen after rmap, as mm_counter() checks mapping (via
* PageAnon()), which is set by __page_set_anon_rmap().
*/
- inc_mm_counter(dst_mm, mm_counter(page));
+ inc_mm_counter(dst_mm, mm_counter(folio));
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -138,28 +234,29 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
-static int mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep,
- bool wp_copy)
+static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ uffd_flags_t flags,
+ struct folio **foliop)
{
- void *page_kaddr;
+ void *kaddr;
int ret;
- struct page *page;
+ struct folio *folio;
- if (!*pagep) {
+ if (!*foliop) {
ret = -ENOMEM;
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
- if (!page)
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
+ dst_addr);
+ if (!folio)
goto out;
- page_kaddr = kmap_local_page(page);
+ kaddr = kmap_local_folio(folio, 0);
/*
* The read mmap_lock is held here. Despite the
* mmap_lock being read recursive a deadlock is still
@@ -176,89 +273,118 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
* and retry the copy outside the mmap_lock.
*/
pagefault_disable();
- ret = copy_from_user(page_kaddr,
- (const void __user *) src_addr,
+ ret = copy_from_user(kaddr, (const void __user *) src_addr,
PAGE_SIZE);
pagefault_enable();
- kunmap_local(page_kaddr);
+ kunmap_local(kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- *pagep = page;
+ *foliop = folio;
/* don't free the page */
goto out;
}
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
} else {
- page = *pagep;
- *pagep = NULL;
+ folio = *foliop;
+ *foliop = NULL;
}
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
ret = -ENOMEM;
- if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, true, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, flags);
if (ret)
goto out_release;
out:
return ret;
out_release:
- put_page(page);
+ folio_put(folio);
goto out;
}
-static int mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct folio *folio;
+ int ret = -ENOMEM;
+
+ folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
+ if (!folio)
+ return ret;
+
+ if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
+ goto out_put;
+
+ /*
+ * The memory barrier inside __folio_mark_uptodate makes sure that
+ * zeroing out the folio become visible before mapping the page
+ * using set_pte_at(). See do_anonymous_page().
+ */
+ __folio_mark_uptodate(folio);
+
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, 0);
+ if (ret)
+ goto out_put;
+
+ return 0;
+out_put:
+ folio_put(folio);
+ return ret;
+}
+
+static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
- pgoff_t offset, max_off;
- struct inode *inode;
+
+ if (mm_forbids_zeropage(dst_vma->vm_mm))
+ return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_unlock;
+ goto out_unlock;
}
ret = -EEXIST;
- if (!pte_none(*dst_pte))
+ if (!pte_none(ptep_get(dst_pte)))
goto out_unlock;
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
+out:
return ret;
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- bool wp_copy)
+static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -266,7 +392,7 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
struct page *page;
int ret;
- ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+ ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
/* Our caller expects us to return -EFAULT if we failed to find folio */
if (ret == -ENOENT)
ret = -EFAULT;
@@ -283,8 +409,8 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
goto out_release;
}
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, false, wp_copy);
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+ page, false, flags);
if (ret)
goto out_release;
@@ -298,6 +424,44 @@ out_release:
goto out;
}
+/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
+static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ int ret;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+
+ _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
+ ret = -EAGAIN;
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ if (!dst_pte)
+ goto out;
+
+ if (mfill_file_over_size(dst_vma, dst_addr)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
+ if (!pte_none(ptep_get(dst_pte)))
+ goto out_unlock;
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+out:
+ return ret;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -321,23 +485,24 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
- * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * mfill_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
*/
-static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(
+ struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ uffd_flags_t flags)
{
- int vm_shared = dst_vma->vm_flags & VM_SHARED;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
ssize_t err;
pte_t *dst_pte;
unsigned long src_addr, dst_addr;
long copied;
- struct page *page;
+ struct folio *folio;
unsigned long vma_hpagesize;
pgoff_t idx;
u32 hash;
@@ -349,15 +514,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (mode == MCOPY_ATOMIC_ZEROPAGE) {
- mmap_read_unlock(dst_mm);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
return -EINVAL;
}
src_addr = src_start;
dst_addr = dst_start;
copied = 0;
- page = NULL;
+ folio = NULL;
vma_hpagesize = vma_kernel_pagesize(dst_vma);
/*
@@ -373,29 +539,33 @@ retry:
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
+
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;
err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
- goto out_unlock;
-
- vm_shared = dst_vma->vm_flags & VM_SHARED;
- }
+ goto out_unlock_vma;
- /*
- * If not shared, ensure the dst_vma has a anon_vma.
- */
- err = -ENOMEM;
- if (!vm_shared) {
- if (unlikely(anon_vma_prepare(dst_vma)))
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
}
while (src_addr < src_start + len) {
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
/*
* Serialize via vma_lock and hugetlb_fault_mutex.
@@ -417,17 +587,19 @@ retry:
goto out_unlock;
}
- if (mode != MCOPY_ATOMIC_CONTINUE &&
- !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
- err = -EEXIST;
- hugetlb_vma_unlock_read(dst_vma);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- goto out_unlock;
+ if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+ if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
+ err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out_unlock;
+ }
}
- err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, mode, &page,
- wp_copy);
+ err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
+ src_addr, flags, &folio);
hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -435,23 +607,21 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- mmap_read_unlock(dst_mm);
- BUG_ON(!page);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
+ VM_WARN_ON_ONCE(!folio);
- err = copy_huge_page_from_user(page,
- (const void __user *)src_addr,
- vma_hpagesize / PAGE_SIZE,
- true);
+ err = copy_folio_from_user(folio,
+ (const void __user *)src_addr, true);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
dst_vma = NULL;
goto retry;
} else
- BUG_ON(page);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += vma_hpagesize;
@@ -466,40 +636,42 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+out_unlock_vma:
+ uffd_mfill_unlock(dst_vma);
out:
- if (page)
- put_page(page);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ if (folio)
+ folio_put(folio);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
- struct vm_area_struct *dst_vma,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy);
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **page,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct folio **foliop)
{
ssize_t err;
- if (mode == MCOPY_ATOMIC_CONTINUE) {
- return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- wp_copy);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+ return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+ dst_addr, flags);
+ } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
+ return mfill_atomic_pte_poison(dst_pmd, dst_vma,
+ dst_addr, flags);
}
/*
@@ -513,72 +685,69 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (mode == MCOPY_ATOMIC_NORMAL)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page,
- wp_copy);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
+ err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ flags, foliop);
else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ err = mfill_atomic_pte_zeropage(dst_pmd,
dst_vma, dst_addr);
} else {
- err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
- mode != MCOPY_ATOMIC_NORMAL,
- wp_copy, page);
+ flags, foliop);
}
return err;
}
-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mcopy_mode,
- atomic_t *mmap_changing,
- __u64 mode)
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
unsigned long src_addr, dst_addr;
long copied;
- struct page *page;
- bool wp_copy;
+ struct folio *folio;
/*
* Sanitize the command parameters:
*/
- BUG_ON(dst_start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
- BUG_ON(dst_start + len <= dst_start);
+ VM_WARN_ON_ONCE(src_start + len <= src_start);
+ VM_WARN_ON_ONCE(dst_start + len <= dst_start);
src_addr = src_start;
dst_addr = dst_start;
copied = 0;
- page = NULL;
+ folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
+ if (IS_ERR(dst_vma)) {
+ err = PTR_ERR(dst_vma);
+ goto out;
+ }
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
- goto out_unlock;
-
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -EINVAL;
@@ -594,37 +763,26 @@ retry:
* validate 'mode' now that we know the dst_vma: don't allow
* a wrprotect copy if the userfaultfd didn't register as WP.
*/
- wp_copy = mode & UFFDIO_COPY_MODE_WP;
- if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
goto out_unlock;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, mcopy_mode,
- wp_copy);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+ src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
- goto out_unlock;
-
- /*
- * Ensure the dst_vma has a anon_vma or this page
- * would get a NULL anon_vma when moved in the
- * dst_vma.
- */
- err = -ENOMEM;
- if (!(dst_vma->vm_flags & VM_SHARED) &&
- unlikely(anon_vma_prepare(dst_vma)))
+ if (!vma_is_shmem(dst_vma) &&
+ uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
- BUG_ON(dst_addr >= dst_start + len);
+ VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -633,51 +791,55 @@ retry:
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
- /*
- * If the dst_pmd is mapped as THP don't
- * override it and just be strict.
- */
- if (unlikely(pmd_trans_huge(dst_pmdval))) {
- err = -EEXIST;
- break;
- }
if (unlikely(pmd_none(dst_pmdval)) &&
unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
- /* If an huge pmd materialized from under us fail */
- if (unlikely(pmd_trans_huge(*dst_pmd))) {
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is THP don't override it and just be strict.
+ * (This includes the case where the PMD used to be THP and
+ * changed back to none after __pte_alloc().)
+ */
+ if (unlikely(!pmd_present(dst_pmdval) ||
+ pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
+ if (unlikely(pmd_bad(dst_pmdval))) {
err = -EFAULT;
break;
}
+ /*
+ * For shmem mappings, khugepaged is allowed to remove page
+ * tables under us; pte_offset_map_lock() will deal with that.
+ */
- BUG_ON(pmd_none(*dst_pmd));
- BUG_ON(pmd_trans_huge(*dst_pmd));
-
- err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, mcopy_mode, wp_copy);
+ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
+ src_addr, flags, &folio);
cond_resched();
if (unlikely(err == -ENOENT)) {
- void *page_kaddr;
+ void *kaddr;
- mmap_read_unlock(dst_mm);
- BUG_ON(!page);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
+ VM_WARN_ON_ONCE(!folio);
- page_kaddr = kmap_local_page(page);
- err = copy_from_user(page_kaddr,
+ kaddr = kmap_local_folio(folio, 0);
+ err = copy_from_user(kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap_local(page_kaddr);
+ kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
goto retry;
} else
- BUG_ON(page);
+ VM_WARN_ON_ONCE(folio);
if (!err) {
dst_addr += PAGE_SIZE;
@@ -692,71 +854,104 @@ retry:
}
out_unlock:
- mmap_read_unlock(dst_mm);
+ up_read(&ctx->map_changing_lock);
+ uffd_mfill_unlock(dst_vma);
out:
- if (page)
- put_page(page);
- BUG_ON(copied < 0);
- BUG_ON(err > 0);
- BUG_ON(!copied && !err);
+ if (folio)
+ folio_put(folio);
+ VM_WARN_ON_ONCE(copied < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!copied && !err);
return copied ? copied : err;
}
-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode)
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ uffd_flags_t flags)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len,
- MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+ return mfill_atomic(ctx, dst_start, src_start, len,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long start,
+ unsigned long len)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
- mmap_changing, 0);
+ return mfill_atomic(ctx, start, 0, len,
+ uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
-ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
- mmap_changing, 0);
+
+ /*
+ * A caller might reasonably assume that UFFDIO_CONTINUE contains an
+ * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
+ * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
+ * subsequent loads from the page through the newly mapped address range.
+ */
+ smp_wmb();
+
+ return mfill_atomic(ctx, start, 0, len,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
+{
+ return mfill_atomic(ctx, start, 0, len,
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
+}
+
+long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
+ unsigned int mm_cp_flags;
struct mmu_gather tlb;
- pgprot_t newprot;
+ long ret;
+ VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end,
+ "The address range exceeds VMA boundary.\n");
if (enable_wp)
- newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+ mm_cp_flags = MM_CP_UFFD_WP;
else
- newprot = vm_get_page_prot(dst_vma->vm_flags);
+ mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;
- tlb_gather_mmu(&tlb, dst_mm);
- change_protection(&tlb, dst_vma, start, start + len, newprot,
- enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ /*
+ * vma->vm_page_prot already reflects that uffd-wp is enabled for this
+ * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
+ * to be write-protected as default whenever protection changes.
+ * Try upgrading write permissions manually.
+ */
+ if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
+ tlb_gather_mmu(&tlb, dst_vma->vm_mm);
+ ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);
+
+ return ret;
}
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp)
{
+ struct mm_struct *dst_mm = ctx->mm;
+ unsigned long end = start + len;
+ unsigned long _start, _end;
struct vm_area_struct *dst_vma;
unsigned long page_mask;
- int err;
+ long err;
+ VMA_ITERATOR(vmi, dst_mm, start);
/*
* Sanitize the command parameters:
*/
- BUG_ON(start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(start + len <= start);
+ VM_WARN_ON_ONCE(start + len <= start);
mmap_read_lock(dst_mm);
@@ -765,31 +960,1161 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, start, len);
+ for_each_vma_range(vmi, dst_vma, end) {
+
+ if (!userfaultfd_wp(dst_vma)) {
+ err = -ENOENT;
+ break;
+ }
+
+ if (is_vm_hugetlb_page(dst_vma)) {
+ err = -EINVAL;
+ page_mask = vma_kernel_pagesize(dst_vma) - 1;
+ if ((start & page_mask) || (len & page_mask))
+ break;
+ }
+
+ _start = max(dst_vma->vm_start, start);
+ _end = min(dst_vma->vm_end, end);
+
+ err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp);
+
+ /* Return 0 on success, <0 on failures */
+ if (err < 0)
+ break;
+ err = 0;
+ }
+out_unlock:
+ up_read(&ctx->map_changing_lock);
+ mmap_read_unlock(dst_mm);
+ return err;
+}
+
+
+void double_pt_lock(spinlock_t *ptl1,
+ spinlock_t *ptl2)
+ __acquires(ptl1)
+ __acquires(ptl2)
+{
+ if (ptl1 > ptl2)
+ swap(ptl1, ptl2);
+ /* lock in virtual address order to avoid lock inversion */
+ spin_lock(ptl1);
+ if (ptl1 != ptl2)
+ spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING);
+ else
+ __acquire(ptl2);
+}
+
+void double_pt_unlock(spinlock_t *ptl1,
+ spinlock_t *ptl2)
+ __releases(ptl1)
+ __releases(ptl2)
+{
+ spin_unlock(ptl1);
+ if (ptl1 != ptl2)
+ spin_unlock(ptl2);
+ else
+ __release(ptl2);
+}
+
+static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval)
+{
+ return pte_same(ptep_get(src_pte), orig_src_pte) &&
+ pte_same(ptep_get(dst_pte), orig_dst_pte) &&
+ pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
+}
+
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for batched
+ * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
+ *
+ * NOTE: folio's reference is not required as the whole operation is within
+ * PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+ unsigned long src_addr,
+ pte_t *src_pte, pte_t *dst_pte)
+{
+ pte_t orig_dst_pte, orig_src_pte;
+ struct folio *folio;
+
+ orig_dst_pte = ptep_get(dst_pte);
+ if (!pte_none(orig_dst_pte))
+ return NULL;
+
+ orig_src_pte = ptep_get(src_pte);
+ if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+ return NULL;
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !folio_trylock(folio))
+ return NULL;
+ if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
+ folio_unlock(folio);
+ return NULL;
+ }
+ return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they are not large, and can
+ * successfully take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio **first_src_folio, unsigned long len)
+{
+ int err = 0;
+ struct folio *src_folio = *first_src_folio;
+ unsigned long src_start = src_addr;
+ unsigned long src_end;
+
+ len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+ src_end = pmd_addr_end(src_addr, src_addr + len);
+ flush_cache_range(src_vma, src_addr, src_end);
+ double_pt_lock(dst_ptl, src_ptl);
+
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ if (folio_test_large(src_folio) ||
+ folio_maybe_dma_pinned(src_folio) ||
+ !PageAnonExclusive(&src_folio->page)) {
+ err = -EBUSY;
+ goto out;
+ }
+ /* It's safe to drop the reference now as the page-table is holding one. */
+ folio_put(*first_src_folio);
+ *first_src_folio = NULL;
+ arch_enter_lazy_mmu_mode();
+
+ while (true) {
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ /* Folio got pinned from under us. Put it back and fail the move. */
+ if (folio_maybe_dma_pinned(src_folio)) {
+ set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+ err = -EBUSY;
+ break;
+ }
+
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+
+ orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+ if (pgtable_supports_soft_dirty())
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+ set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+ src_addr += PAGE_SIZE;
+ if (src_addr == src_end)
+ break;
+ dst_addr += PAGE_SIZE;
+ dst_pte++;
+ src_pte++;
+
+ folio_unlock(src_folio);
+ src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+ src_pte, dst_pte);
+ if (!src_folio)
+ break;
+ }
+
+ arch_leave_lazy_mmu_mode();
+ if (src_addr > src_start)
+ flush_tlb_range(src_vma, src_start, src_addr);
+
+ if (src_folio)
+ folio_unlock(src_folio);
+out:
+ double_pt_unlock(dst_ptl, src_ptl);
+ return src_addr > src_start ? src_addr - src_start : err;
+}
+
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio,
+ struct swap_info_struct *si, swp_entry_t entry)
+{
+ /*
+ * Check if the folio still belongs to the target swap entry after
+ * acquiring the lock. Folio can be freed in the swap cache while
+ * not locked.
+ */
+ if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+ entry.val != src_folio->swap.val))
+ return -EAGAIN;
+
+ double_pt_lock(dst_ptl, src_ptl);
+
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ } else {
+ /*
+ * Check if the swap entry is cached after acquiring the src_pte
+ * lock. Otherwise, we might miss a newly loaded swap cache folio.
+ *
+ * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+ * We are trying to catch newly added swap cache, the only possible case is
+ * when a folio is swapped in and out again staying in swap cache, using the
+ * same entry before the PTE check above. The PTL is acquired and released
+ * twice, each time after updating the swap_map's flag. So holding
+ * the PTL here ensures we see the updated value. False positive is possible,
+ * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+ * cache, or during the tiny synchronization window between swap cache and
+ * swap_map, but it will be gone very quickly, worst result is retry jitters.
+ */
+ if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+ }
+
+ orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+ if (pgtable_supports_soft_dirty())
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+ set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
+ double_pt_unlock(dst_ptl, src_ptl);
+
+ return PAGE_SIZE;
+}
+
+static int move_zeropage_pte(struct mm_struct *mm,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ pte_t *dst_pte, pte_t *src_pte,
+ pte_t orig_dst_pte, pte_t orig_src_pte,
+ pmd_t *dst_pmd, pmd_t dst_pmdval,
+ spinlock_t *dst_ptl, spinlock_t *src_ptl)
+{
+ pte_t zero_pte;
+
+ double_pt_lock(dst_ptl, src_ptl);
+ if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval)) {
+ double_pt_unlock(dst_ptl, src_ptl);
+ return -EAGAIN;
+ }
+
+ zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ dst_vma->vm_page_prot));
+ ptep_clear_flush(src_vma, src_addr, src_pte);
+ set_pte_at(mm, dst_addr, dst_pte, zero_pte);
+ double_pt_unlock(dst_ptl, src_ptl);
+
+ return PAGE_SIZE;
+}
+
+
+/*
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return number of bytes moved.
+ * On failure, an error code is returned.
+ */
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ unsigned long len, __u64 mode)
+{
+ struct swap_info_struct *si = NULL;
+ pte_t orig_src_pte, orig_dst_pte;
+ pte_t src_folio_pte;
+ spinlock_t *src_ptl, *dst_ptl;
+ pte_t *src_pte = NULL;
+ pte_t *dst_pte = NULL;
+ pmd_t dummy_pmdval;
+ pmd_t dst_pmdval;
+ struct folio *src_folio = NULL;
+ struct mmu_notifier_range range;
+ long ret = 0;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ src_addr, src_addr + len);
+ mmu_notifier_invalidate_range_start(&range);
+retry:
+ /*
+ * Use the maywrite version to indicate that dst_pte will be modified,
+ * since dst_pte needs to be none, the subsequent pte_same() check
+ * cannot prevent the dst_pte page from being freed concurrently, so we
+ * also need to abtain dst_pmdval and recheck pmd_same() later.
+ */
+ dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
+ &dst_ptl);
+
+ /* Retry if a huge pmd materialized from under us */
+ if (unlikely(!dst_pte)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /*
+ * Unlike dst_pte, the subsequent pte_same() check can ensure the
+ * stability of the src_pte page, so there is no need to get pmdval,
+ * just pass a dummy variable to it.
+ */
+ src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
+ &src_ptl);
+
+ /*
+ * We held the mmap_lock for reading so MADV_DONTNEED
+ * can zap transparent huge pages under us, or the
+ * transparent huge page fault can establish new
+ * transparent huge pages under us.
+ */
+ if (unlikely(!src_pte)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* Sanity checks before the operation */
+ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
+ pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ spin_lock(dst_ptl);
+ orig_dst_pte = ptep_get(dst_pte);
+ spin_unlock(dst_ptl);
+ if (!pte_none(orig_dst_pte)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ spin_lock(src_ptl);
+ orig_src_pte = ptep_get(src_pte);
+ spin_unlock(src_ptl);
+ if (pte_none(orig_src_pte)) {
+ if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
+ ret = -ENOENT;
+ else /* nothing to do to move a hole */
+ ret = PAGE_SIZE;
+ goto out;
+ }
+
+ /* If PTE changed after we locked the folio them start over */
+ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ if (pte_present(orig_src_pte)) {
+ if (is_zero_pfn(pte_pfn(orig_src_pte))) {
+ ret = move_zeropage_pte(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte,
+ dst_pmd, dst_pmdval, dst_ptl, src_ptl);
+ goto out;
+ }
+
+ /*
+ * Pin and lock source folio. Since we are in RCU read section,
+ * we can't block, so on contention have to unmap the ptes,
+ * obtain the lock and retry.
+ */
+ if (!src_folio) {
+ struct folio *folio;
+ bool locked;
+
+ /*
+ * Pin the page while holding the lock to be sure the
+ * page isn't freed under us
+ */
+ spin_lock(src_ptl);
+ if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
+ spin_unlock(src_ptl);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+ if (!folio || !PageAnonExclusive(&folio->page)) {
+ spin_unlock(src_ptl);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ folio_get(folio);
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ spin_unlock(src_ptl);
+
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+
+ if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* at this point we have src_folio locked */
+ if (folio_test_large(src_folio)) {
+ /* split_folio() can block */
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ ret = split_folio(src_folio);
+ if (ret)
+ goto out;
+ /* have to reacquire the folio after it got split */
+ folio_unlock(src_folio);
+ folio_put(src_folio);
+ src_folio = NULL;
+ goto retry;
+ }
+
+ ret = move_present_ptes(mm, dst_vma, src_vma,
+ dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd,
+ dst_pmdval, dst_ptl, src_ptl, &src_folio,
+ len);
+ } else { /* !pte_present() */
+ struct folio *folio = NULL;
+ const softleaf_t entry = softleaf_from_pte(orig_src_pte);
+
+ if (softleaf_is_migration(entry)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ migration_entry_wait(mm, src_pmd, src_addr);
+
+ ret = -EAGAIN;
+ goto out;
+ } else if (!softleaf_is_swap(entry)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = swap_cache_get_folio(entry);
+ if (folio) {
+ if (folio_test_large(folio)) {
+ ret = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio, si, entry);
+ }
+
+out:
+ if (src_folio) {
+ folio_unlock(src_folio);
+ folio_put(src_folio);
+ }
+ /*
+ * Unmap in reverse order (LIFO) to maintain proper kmap_local
+ * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+ * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+ */
+ if (src_pte)
+ pte_unmap(src_pte);
+ if (dst_pte)
+ pte_unmap(dst_pte);
+ mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
+
+ return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline bool move_splits_huge_pmd(unsigned long dst_addr,
+ unsigned long src_addr,
+ unsigned long src_end)
+{
+ return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
+ src_end - src_addr < HPAGE_PMD_SIZE;
+}
+#else
+static inline bool move_splits_huge_pmd(unsigned long dst_addr,
+ unsigned long src_addr,
+ unsigned long src_end)
+{
+ /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
+ return false;
+}
+#endif
+
+static inline bool vma_move_compatible(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB |
+ VM_MIXEDMAP | VM_SHADOW_STACK));
+}
+
+static int validate_move_areas(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *src_vma,
+ struct vm_area_struct *dst_vma)
+{
+ /* Only allow moving if both have the same access and protection */
+ if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
+ pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
+ return -EINVAL;
+
+ /* Only allow moving if both are mlocked or both aren't */
+ if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
+ return -EINVAL;
+
+ /*
+ * For now, we keep it simple and only move between writable VMAs.
+ * Access flags are equal, therefore checking only the source is enough.
+ */
+ if (!(src_vma->vm_flags & VM_WRITE))
+ return -EINVAL;
+
+ /* Check if vma flags indicate content which can be moved */
+ if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
+ return -EINVAL;
+
+ /* Ensure dst_vma is registered in uffd we are operating on */
+ if (!dst_vma->vm_userfaultfd_ctx.ctx ||
+ dst_vma->vm_userfaultfd_ctx.ctx != ctx)
+ return -EINVAL;
+
+ /* Only allow moving across anonymous vmas */
+ if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
+ return -EINVAL;
+
+ return 0;
+}
+
+static __always_inline
+int find_vmas_mm_locked(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+ vma = find_vma_and_prepare_anon(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
+ /* Skip finding src_vma if src_start is in dst_vma */
+ if (src_start >= vma->vm_start && src_start < vma->vm_end)
+ goto out_success;
+
+ vma = vma_lookup(mm, src_start);
+ if (!vma)
+ return -ENOENT;
+out_success:
+ *src_vmap = vma;
+ return 0;
+}
+
+#ifdef CONFIG_PER_VMA_LOCK
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ vma = uffd_lock_vma(mm, dst_start);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ *dst_vmap = vma;
+ /*
+ * Skip finding src_vma if src_start is in dst_vma. This also ensures
+ * that we don't lock the same vma twice.
+ */
+ if (src_start >= vma->vm_start && src_start < vma->vm_end) {
+ *src_vmap = vma;
+ return 0;
+ }
+
+ /*
+ * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
+ *
+ * Thread1 Thread2
+ * ------- -------
+ * vma_start_read(dst_vma)
+ * mmap_write_lock(mm)
+ * vma_start_write(src_vma)
+ * vma_start_read(src_vma)
+ * mmap_read_lock(mm)
+ * vma_start_write(dst_vma)
+ */
+ *src_vmap = lock_vma_under_rcu(mm, src_start);
+ if (likely(*src_vmap))
+ return 0;
+
+ /* Undo any locking and retry in mmap_lock critical section */
+ vma_end_read(*dst_vmap);
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (err)
+ goto out;
+
+ if (!vma_start_read_locked(*dst_vmap)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ /* Nothing further to do if both vmas are locked. */
+ if (*dst_vmap == *src_vmap)
+ goto out;
- if (!dst_vma)
+ if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
+ /* Undo dst_vmap locking if src_vmap failed to lock */
+ vma_end_read(*dst_vmap);
+ err = -EAGAIN;
+ }
+out:
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ vma_end_read(src_vma);
+ if (src_vma != dst_vma)
+ vma_end_read(dst_vma);
+}
+
+#else
+
+static int uffd_move_lock(struct mm_struct *mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ struct vm_area_struct **dst_vmap,
+ struct vm_area_struct **src_vmap)
+{
+ int err;
+
+ mmap_read_lock(mm);
+ err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
+ if (err)
+ mmap_read_unlock(mm);
+ return err;
+}
+
+static void uffd_move_unlock(struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
+{
+ mmap_assert_locked(src_vma->vm_mm);
+ mmap_read_unlock(dst_vma->vm_mm);
+}
+#endif
+
+/**
+ * move_pages - move arbitrary anonymous pages of an existing vma
+ * @ctx: pointer to the userfaultfd context
+ * @dst_start: start of the destination virtual memory range
+ * @src_start: start of the source virtual memory range
+ * @len: length of the virtual memory range
+ * @mode: flags from uffdio_move.mode
+ *
+ * It will either use the mmap_lock in read mode or per-vma locks
+ *
+ * move_pages() remaps arbitrary anonymous pages atomically in zero
+ * copy. It only works on non shared anonymous pages because those can
+ * be relocated without generating non linear anon_vmas in the rmap
+ * code.
+ *
+ * It provides a zero copy mechanism to handle userspace page faults.
+ * The source vma pages should have mapcount == 1, which can be
+ * enforced by using madvise(MADV_DONTFORK) on src vma.
+ *
+ * The thread receiving the page during the userland page fault
+ * will receive the faulting page in the source vma through the network,
+ * storage or any other I/O device (MADV_DONTFORK in the source vma
+ * avoids move_pages() to fail with -EBUSY if the process forks before
+ * move_pages() is called), then it will call move_pages() to map the
+ * page in the faulting address in the destination vma.
+ *
+ * This userfaultfd command works purely via pagetables, so it's the
+ * most efficient way to move physical non shared anonymous pages
+ * across different virtual addresses. Unlike mremap()/mmap()/munmap()
+ * it does not create any new vmas. The mapping in the destination
+ * address is atomic.
+ *
+ * It only works if the vma protection bits are identical from the
+ * source and destination vma.
+ *
+ * It can remap non shared anonymous pages within the same vma too.
+ *
+ * If the source virtual memory range has any unmapped holes, or if
+ * the destination virtual memory range is not a whole unmapped hole,
+ * move_pages() will fail respectively with -ENOENT or -EEXIST. This
+ * provides a very strict behavior to avoid any chance of memory
+ * corruption going unnoticed if there are userland race conditions.
+ * Only one thread should resolve the userland page fault at any given
+ * time for any given faulting address. This means that if two threads
+ * try to both call move_pages() on the same destination address at the
+ * same time, the second thread will get an explicit error from this
+ * command.
+ *
+ * The command retval will return "len" is successful. The command
+ * however can be interrupted by fatal signals or errors. If
+ * interrupted it will return the number of bytes successfully
+ * remapped before the interruption if any, or the negative error if
+ * none. It will never return zero. Either it will return an error or
+ * an amount of bytes successfully moved. If the retval reports a
+ * "short" remap, the move_pages() command should be repeated by
+ * userland with src+retval, dst+reval, len-retval if it wants to know
+ * about the error that interrupted it.
+ *
+ * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
+ * prevent -ENOENT errors to materialize if there are holes in the
+ * source virtual range that is being remapped. The holes will be
+ * accounted as successfully remapped in the retval of the
+ * command. This is mostly useful to remap hugepage naturally aligned
+ * virtual regions without knowing if there are transparent hugepage
+ * in the regions or not, but preventing the risk of having to split
+ * the hugepmd during the remap.
+ */
+ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long src_start, unsigned long len, __u64 mode)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *src_vma, *dst_vma;
+ unsigned long src_addr, dst_addr, src_end;
+ pmd_t *src_pmd, *dst_pmd;
+ long err = -EINVAL;
+ ssize_t moved = 0;
+
+ /* Sanitize the command parameters. */
+ VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+
+ /* Does the address range wrap, or is the span zero-sized? */
+ VM_WARN_ON_ONCE(src_start + len < src_start);
+ VM_WARN_ON_ONCE(dst_start + len < dst_start);
+
+ err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
+ if (err)
+ goto out;
+
+ /* Re-check after taking map_changing_lock */
+ err = -EAGAIN;
+ down_read(&ctx->map_changing_lock);
+ if (likely(atomic_read(&ctx->mmap_changing)))
+ goto out_unlock;
+ /*
+ * Make sure the vma is not shared, that the src and dst remap
+ * ranges are both valid and fully within a single existing
+ * vma.
+ */
+ err = -EINVAL;
+ if (src_vma->vm_flags & VM_SHARED)
goto out_unlock;
- if (!userfaultfd_wp(dst_vma))
+ if (src_start + len > src_vma->vm_end)
+ goto out_unlock;
+
+ if (dst_vma->vm_flags & VM_SHARED)
goto out_unlock;
- if (!vma_can_userfault(dst_vma, dst_vma->vm_flags))
+ if (dst_start + len > dst_vma->vm_end)
goto out_unlock;
- if (is_vm_hugetlb_page(dst_vma)) {
- err = -EINVAL;
- page_mask = vma_kernel_pagesize(dst_vma) - 1;
- if ((start & page_mask) || (len & page_mask))
- goto out_unlock;
- }
+ err = validate_move_areas(ctx, src_vma, dst_vma);
+ if (err)
+ goto out_unlock;
+
+ for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+ src_addr < src_end;) {
+ spinlock_t *ptl;
+ pmd_t dst_pmdval;
+ unsigned long step_size;
+
+ /*
+ * Below works because anonymous area would not have a
+ * transparent huge PUD. If file-backed support is added,
+ * that case would need to be handled here.
+ */
+ src_pmd = mm_find_pmd(mm, src_addr);
+ if (unlikely(!src_pmd)) {
+ if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
+ err = -ENOENT;
+ break;
+ }
+ src_pmd = mm_alloc_pmd(mm, src_addr);
+ if (unlikely(!src_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+ }
+ dst_pmd = mm_alloc_pmd(mm, dst_addr);
+ if (unlikely(!dst_pmd)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ dst_pmdval = pmdp_get_lockless(dst_pmd);
+ /*
+ * If the dst_pmd is mapped as THP don't override it and just
+ * be strict. If dst_pmd changes into TPH after this check, the
+ * move_pages_huge_pmd() will detect the change and retry
+ * while move_pages_pte() will detect the change and fail.
+ */
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
- uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ ptl = pmd_trans_huge_lock(src_pmd, src_vma);
+ if (ptl) {
+ /* Check if we can move the pmd without splitting it. */
+ if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
+ !pmd_none(dst_pmdval)) {
+ /* Can be a migration entry */
+ if (pmd_present(*src_pmd)) {
+ struct folio *folio = pmd_folio(*src_pmd);
+
+ if (!is_huge_zero_folio(folio) &&
+ !PageAnonExclusive(&folio->page)) {
+ spin_unlock(ptl);
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ spin_unlock(ptl);
+ split_huge_pmd(src_vma, src_pmd, src_addr);
+ /* The folio will be split by move_pages_pte() */
+ continue;
+ }
+
+ err = move_pages_huge_pmd(mm, dst_pmd, src_pmd,
+ dst_pmdval, dst_vma, src_vma,
+ dst_addr, src_addr);
+ step_size = HPAGE_PMD_SIZE;
+ } else {
+ long ret;
+
+ if (pmd_none(*src_pmd)) {
+ if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
+ err = -ENOENT;
+ break;
+ }
+ if (unlikely(__pte_alloc(mm, src_pmd))) {
+ err = -ENOMEM;
+ break;
+ }
+ }
+
+ if (unlikely(pte_alloc(mm, dst_pmd))) {
+ err = -ENOMEM;
+ break;
+ }
+
+ ret = move_pages_ptes(mm, dst_pmd, src_pmd,
+ dst_vma, src_vma, dst_addr,
+ src_addr, src_end - src_addr, mode);
+ if (ret < 0)
+ err = ret;
+ else
+ step_size = ret;
+ }
+
+ cond_resched();
+
+ if (fatal_signal_pending(current)) {
+ /* Do not override an error */
+ if (!err || err == -EAGAIN)
+ err = -EINTR;
+ break;
+ }
+
+ if (err) {
+ if (err == -EAGAIN)
+ continue;
+ break;
+ }
+
+ /* Proceed to the next page */
+ dst_addr += step_size;
+ src_addr += step_size;
+ moved += step_size;
+ }
- err = 0;
out_unlock:
- mmap_read_unlock(dst_mm);
- return err;
+ up_read(&ctx->map_changing_lock);
+ uffd_move_unlock(dst_vma, src_vma);
+out:
+ VM_WARN_ON_ONCE(moved < 0);
+ VM_WARN_ON_ONCE(err > 0);
+ VM_WARN_ON_ONCE(!moved && !err);
+ return moved ? moved : err;
+}
+
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+ vm_flags_t vm_flags)
+{
+ const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
+
+ vm_flags_reset(vma, vm_flags);
+ /*
+ * For shared mappings, we want to enable writenotify while
+ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+ */
+ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+ vma_set_page_prot(vma);
+}
+
+static void userfaultfd_set_ctx(struct vm_area_struct *vma,
+ struct userfaultfd_ctx *ctx,
+ vm_flags_t vm_flags)
+{
+ vma_start_write(vma);
+ vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
+ userfaultfd_set_vm_flags(vma,
+ (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
+}
+
+void userfaultfd_reset_ctx(struct vm_area_struct *vma)
+{
+ userfaultfd_set_ctx(vma, NULL, 0);
+}
+
+struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+ struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
+
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(vma, start, end - start, false);
+
+ ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
+ vma->vm_flags & ~__VM_UFFD_FLAGS,
+ NULL_VM_UFFD_CTX, give_up_on_oom);
+
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ if (!IS_ERR(ret))
+ userfaultfd_reset_ctx(ret);
+
+ return ret;
+}
+
+/* Assumes mmap write lock taken, and mm_struct pinned. */
+int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
+ vm_flags_t vm_flags,
+ unsigned long start, unsigned long end,
+ bool wp_async)
+{
+ VMA_ITERATOR(vmi, ctx->mm, start);
+ struct vm_area_struct *prev = vma_prev(&vmi);
+ unsigned long vma_end;
+ vm_flags_t new_flags;
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ for_each_vma_range(vmi, vma, end) {
+ cond_resched();
+
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ userfaultfd_set_ctx(vma, ctx, vm_flags);
+
+ if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+ hugetlb_unshare_all_pmds(vma);
+
+skip:
+ prev = vma;
+ start = vma->vm_end;
+ }
+
+ return 0;
+}
+
+void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ /* the various vma->vm_userfaultfd_ctx still points to it */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == ctx)
+ userfaultfd_reset_ctx(vma);
+ }
+ mmap_write_unlock(mm);
+}
+
+void userfaultfd_release_all(struct mm_struct *mm,
+ struct userfaultfd_ctx *ctx)
+{
+ struct vm_area_struct *vma, *prev;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ if (!mmget_not_zero(mm))
+ return;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
+ * changes while handle_userfault released the mmap_lock. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_lock for writing.
+ */
+ mmap_write_lock(mm);
+ prev = NULL;
+ for_each_vma(vmi, vma) {
+ cond_resched();
+ VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & __VM_UFFD_FLAGS));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ vma->vm_start, vma->vm_end);
+ prev = vma;
+ }
+ mmap_write_unlock(mm);
+ mmput(mm);
}