Diffstat (limited to 'mm/userfaultfd.c')
 mm/userfaultfd.c | 810 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 581 insertions(+), 229 deletions(-)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index defa5109cc62..e6dfd5f28acd 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -10,7 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
@@ -18,6 +18,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include "internal.h"
+#include "swap.h"
 
 static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma,
                                              unsigned long dst_end)
@@ -85,14 +86,10 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
         mmap_read_lock(mm);
         vma = find_vma_and_prepare_anon(mm, address);
         if (!IS_ERR(vma)) {
-                /*
-                 * We cannot use vma_start_read() as it may fail due to
-                 * false locked (see comment in vma_start_read()). We
-                 * can avoid that by directly locking vm_lock under
-                 * mmap_lock, which guarantees that nobody can lock the
-                 * vma for write (vma_start_write()) under us.
-                 */
-                down_read(&vma->vm_lock->lock);
+                bool locked = vma_start_read_locked(vma);
+
+                if (!locked)
+                        vma = ERR_PTR(-EAGAIN);
         }
 
         mmap_read_unlock(mm);
@@ -181,6 +178,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
         spinlock_t *ptl;
         struct folio *folio = page_folio(page);
         bool page_in_cache = folio_mapping(folio);
+        pte_t dst_ptep;
 
         _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
         _dst_pte = pte_mkdirty(_dst_pte);
@@ -202,12 +200,15 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
         }
 
         ret = -EEXIST;
+
+        dst_ptep = ptep_get(dst_pte);
+
         /*
-         * We allow to overwrite a pte marker: consider when both MISSING|WP
-         * registered, we firstly wr-protect a none pte which has no page cache
-         * page backing it, then access the page.
+         * We are allowed to overwrite a UFFD pte marker: consider when both
+         * MISSING|WP registered, we firstly wr-protect a none pte which has no
+         * page cache page backing it, then access the page.
          */
-        if (!pte_none_mostly(ptep_get(dst_pte)))
+        if (!pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
                 goto out_unlock;
 
         if (page_in_cache) {
@@ -216,7 +217,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
                 folio_add_lru(folio);
                 folio_add_file_rmap_pte(folio, page, dst_vma);
         } else {
-                folio_add_new_anon_rmap(folio, dst_vma, dst_addr);
+                folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
                 folio_add_lru_vma(folio, dst_vma);
         }
@@ -251,7 +252,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
         if (!*foliop) {
                 ret = -ENOMEM;
                 folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
-                                        dst_addr, false);
+                                        dst_addr);
                 if (!folio)
                         goto out;
 
@@ -391,7 +392,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
         struct page *page;
         int ret;
 
-        ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+        ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
         /* Our caller expects us to return -EFAULT if we failed to find folio */
         if (ret == -ENOENT)
                 ret = -EFAULT;
@@ -564,7 +565,7 @@ retry:
         }
 
         while (src_addr < src_start + len) {
-                BUG_ON(dst_addr >= dst_start + len);
+                VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
 
                 /*
                  * Serialize via vma_lock and hugetlb_fault_mutex.
@@ -586,12 +587,15 @@ retry:
                         goto out_unlock;
                 }
 
-                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
-                    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
-                        err = -EEXIST;
-                        hugetlb_vma_unlock_read(dst_vma);
-                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                        goto out_unlock;
+                if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
+                        const pte_t ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
+
+                        if (!huge_pte_none(ptep) && !pte_is_uffd_marker(ptep)) {
+                                err = -EEXIST;
+                                hugetlb_vma_unlock_read(dst_vma);
+                                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                                goto out_unlock;
+                        }
                 }
 
                 err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
@@ -605,7 +609,7 @@ retry:
                 if (unlikely(err == -ENOENT)) {
                         up_read(&ctx->map_changing_lock);
                         uffd_mfill_unlock(dst_vma);
-                        BUG_ON(!folio);
+                        VM_WARN_ON_ONCE(!folio);
 
                         err = copy_folio_from_user(folio,
                                                    (const void __user *)src_addr, true);
@@ -617,7 +621,7 @@ retry:
                         dst_vma = NULL;
                         goto retry;
                 } else
-                        BUG_ON(folio);
+                        VM_WARN_ON_ONCE(folio);
 
                 if (!err) {
                         dst_addr += vma_hpagesize;
@@ -638,9 +642,9 @@ out_unlock_vma:
 out:
         if (folio)
                 folio_put(folio);
-        BUG_ON(copied < 0);
-        BUG_ON(err > 0);
-        BUG_ON(!copied && !err);
+        VM_WARN_ON_ONCE(copied < 0);
+        VM_WARN_ON_ONCE(err > 0);
+        VM_WARN_ON_ONCE(!copied && !err);
         return copied ? copied : err;
 }
 #else /* !CONFIG_HUGETLB_PAGE */
@@ -714,12 +718,12 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
         /*
          * Sanitize the command parameters:
          */
-        BUG_ON(dst_start & ~PAGE_MASK);
-        BUG_ON(len & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(len & ~PAGE_MASK);
 
         /* Does the address range wrap, or is the span zero-sized? */
-        BUG_ON(src_start + len <= src_start);
-        BUG_ON(dst_start + len <= dst_start);
+        VM_WARN_ON_ONCE(src_start + len <= src_start);
+        VM_WARN_ON_ONCE(dst_start + len <= dst_start);
 
         src_addr = src_start;
         dst_addr = dst_start;
@@ -778,7 +782,7 @@ retry:
         while (src_addr < src_start + len) {
                 pmd_t dst_pmdval;
 
-                BUG_ON(dst_addr >= dst_start + len);
+                VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
 
                 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
                 if (unlikely(!dst_pmd)) {
@@ -787,27 +791,30 @@ retry:
                 }
                 dst_pmdval = pmdp_get_lockless(dst_pmd);
-                /*
-                 * If the dst_pmd is mapped as THP don't
-                 * override it and just be strict.
-                 */
-                if (unlikely(pmd_trans_huge(dst_pmdval))) {
-                        err = -EEXIST;
-                        break;
-                }
                 if (unlikely(pmd_none(dst_pmdval)) &&
                     unlikely(__pte_alloc(dst_mm, dst_pmd))) {
                         err = -ENOMEM;
                         break;
                 }
-                /* If an huge pmd materialized from under us fail */
-                if (unlikely(pmd_trans_huge(*dst_pmd))) {
+                dst_pmdval = pmdp_get_lockless(dst_pmd);
+                /*
+                 * If the dst_pmd is THP don't override it and just be strict.
+                 * (This includes the case where the PMD used to be THP and
+                 * changed back to none after __pte_alloc().)
+                 */
+                if (unlikely(!pmd_present(dst_pmdval) ||
+                             pmd_trans_huge(dst_pmdval))) {
+                        err = -EEXIST;
+                        break;
+                }
+                if (unlikely(pmd_bad(dst_pmdval))) {
                         err = -EFAULT;
                         break;
                 }
-
-                BUG_ON(pmd_none(*dst_pmd));
-                BUG_ON(pmd_trans_huge(*dst_pmd));
+                /*
+                 * For shmem mappings, khugepaged is allowed to remove page
+                 * tables under us; pte_offset_map_lock() will deal with that.
+                 */
 
                 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
                                        src_addr, flags, &folio);
@@ -818,7 +825,7 @@ retry:
                         up_read(&ctx->map_changing_lock);
                         uffd_mfill_unlock(dst_vma);
-                        BUG_ON(!folio);
+                        VM_WARN_ON_ONCE(!folio);
 
                         kaddr = kmap_local_folio(folio, 0);
                         err = copy_from_user(kaddr,
@@ -832,7 +839,7 @@ retry:
                         flush_dcache_folio(folio);
                         goto retry;
                 } else
-                        BUG_ON(folio);
+                        VM_WARN_ON_ONCE(folio);
 
                 if (!err) {
                         dst_addr += PAGE_SIZE;
@@ -852,9 +859,9 @@ out_unlock:
 out:
         if (folio)
                 folio_put(folio);
-        BUG_ON(copied < 0);
-        BUG_ON(err > 0);
-        BUG_ON(!copied && !err);
+        VM_WARN_ON_ONCE(copied < 0);
+        VM_WARN_ON_ONCE(err > 0);
+        VM_WARN_ON_ONCE(!copied && !err);
         return copied ? copied : err;
 }
 
@@ -940,11 +947,11 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
         /*
          * Sanitize the command parameters:
          */
-        BUG_ON(start & ~PAGE_MASK);
-        BUG_ON(len & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(len & ~PAGE_MASK);
 
         /* Does the address range wrap, or is the span zero-sized? */
-        BUG_ON(start + len <= start);
+        VM_WARN_ON_ONCE(start + len <= start);
 
         mmap_read_lock(dst_mm);
@@ -995,14 +1002,8 @@ void double_pt_lock(spinlock_t *ptl1,
         __acquires(ptl1)
         __acquires(ptl2)
 {
-        spinlock_t *ptl_tmp;
-
-        if (ptl1 > ptl2) {
-                /* exchange ptl1 and ptl2 */
-                ptl_tmp = ptl1;
-                ptl1 = ptl2;
-                ptl2 = ptl_tmp;
-        }
+        if (ptl1 > ptl2)
+                swap(ptl1, ptl2);
         /* lock in virtual address order to avoid lock inversion */
         spin_lock(ptl1);
         if (ptl1 != ptl2)
@@ -1023,22 +1024,73 @@ void double_pt_unlock(spinlock_t *ptl1,
                 __release(ptl2);
 }
 
+static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte,
+                                       pte_t orig_dst_pte, pte_t orig_src_pte,
+                                       pmd_t *dst_pmd, pmd_t dst_pmdval)
+{
+        return pte_same(ptep_get(src_pte), orig_src_pte) &&
+               pte_same(ptep_get(dst_pte), orig_dst_pte) &&
+               pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd));
+}
 
-static int move_present_pte(struct mm_struct *mm,
-                            struct vm_area_struct *dst_vma,
-                            struct vm_area_struct *src_vma,
-                            unsigned long dst_addr, unsigned long src_addr,
-                            pte_t *dst_pte, pte_t *src_pte,
-                            pte_t orig_dst_pte, pte_t orig_src_pte,
-                            spinlock_t *dst_ptl, spinlock_t *src_ptl,
-                            struct folio *src_folio)
+/*
+ * Checks if the two ptes and the corresponding folio are eligible for batched
+ * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL.
+ *
+ * NOTE: folio's reference is not required as the whole operation is within
+ * PTL's critical section.
+ */
+static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma,
+                                                 unsigned long src_addr,
+                                                 pte_t *src_pte, pte_t *dst_pte)
+{
+        pte_t orig_dst_pte, orig_src_pte;
+        struct folio *folio;
+
+        orig_dst_pte = ptep_get(dst_pte);
+        if (!pte_none(orig_dst_pte))
+                return NULL;
+
+        orig_src_pte = ptep_get(src_pte);
+        if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte)))
+                return NULL;
+
+        folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+        if (!folio || !folio_trylock(folio))
+                return NULL;
+        if (!PageAnonExclusive(&folio->page) || folio_test_large(folio)) {
+                folio_unlock(folio);
+                return NULL;
+        }
+        return folio;
+}
+
+/*
+ * Moves src folios to dst in a batch as long as they are not large, and can
+ * successfully take the lock via folio_trylock().
+ */
+static long move_present_ptes(struct mm_struct *mm,
+                              struct vm_area_struct *dst_vma,
+                              struct vm_area_struct *src_vma,
+                              unsigned long dst_addr, unsigned long src_addr,
+                              pte_t *dst_pte, pte_t *src_pte,
+                              pte_t orig_dst_pte, pte_t orig_src_pte,
+                              pmd_t *dst_pmd, pmd_t dst_pmdval,
+                              spinlock_t *dst_ptl, spinlock_t *src_ptl,
+                              struct folio **first_src_folio, unsigned long len)
 {
         int err = 0;
+        struct folio *src_folio = *first_src_folio;
+        unsigned long src_start = src_addr;
+        unsigned long src_end;
 
+        len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr;
+        src_end = pmd_addr_end(src_addr, src_addr + len);
+        flush_cache_range(src_vma, src_addr, src_end);
         double_pt_lock(dst_ptl, src_ptl);
 
-        if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
-            !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+                                 dst_pmd, dst_pmdval)) {
                 err = -EAGAIN;
                 goto out;
         }
@@ -1048,50 +1100,119 @@ static int move_present_pte(struct mm_struct *mm,
                 err = -EBUSY;
                 goto out;
         }
+        /* It's safe to drop the reference now as the page-table is holding one. */
+        folio_put(*first_src_folio);
+        *first_src_folio = NULL;
 
+        arch_enter_lazy_mmu_mode();
+
+        while (true) {
+                orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+                /* Folio got pinned from under us. Put it back and fail the move. */
+                if (folio_maybe_dma_pinned(src_folio)) {
+                        set_pte_at(mm, src_addr, src_pte, orig_src_pte);
+                        err = -EBUSY;
+                        break;
+                }
 
-        orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
-        /* Folio got pinned from under us. Put it back and fail the move. */
-        if (folio_maybe_dma_pinned(src_folio)) {
-                set_pte_at(mm, src_addr, src_pte, orig_src_pte);
-                err = -EBUSY;
-                goto out;
-        }
+                folio_move_anon_rmap(src_folio, dst_vma);
+                src_folio->index = linear_page_index(dst_vma, dst_addr);
+
+                orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
+                /* Set soft dirty bit so userspace can notice the pte was moved */
+                if (pgtable_supports_soft_dirty())
+                        orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+                if (pte_dirty(orig_src_pte))
+                        orig_dst_pte = pte_mkdirty(orig_dst_pte);
+                orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
+                set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+
+                src_addr += PAGE_SIZE;
+                if (src_addr == src_end)
+                        break;
+                dst_addr += PAGE_SIZE;
+                dst_pte++;
+                src_pte++;
 
-        folio_move_anon_rmap(src_folio, dst_vma);
-        src_folio->index = linear_page_index(dst_vma, dst_addr);
+                folio_unlock(src_folio);
+                src_folio = check_ptes_for_batched_move(src_vma, src_addr,
+                                                        src_pte, dst_pte);
+                if (!src_folio)
+                        break;
+        }
 
-        orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
-        /* Follow mremap() behavior and treat the entry dirty after the move */
-        orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+        arch_leave_lazy_mmu_mode();
+        if (src_addr > src_start)
+                flush_tlb_range(src_vma, src_start, src_addr);
 
-        set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
+        if (src_folio)
+                folio_unlock(src_folio);
 out:
         double_pt_unlock(dst_ptl, src_ptl);
-        return err;
+        return src_addr > src_start ? src_addr - src_start : err;
 }
 
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
                          unsigned long dst_addr, unsigned long src_addr,
                          pte_t *dst_pte, pte_t *src_pte,
                          pte_t orig_dst_pte, pte_t orig_src_pte,
-                         spinlock_t *dst_ptl, spinlock_t *src_ptl)
+                         pmd_t *dst_pmd, pmd_t dst_pmdval,
+                         spinlock_t *dst_ptl, spinlock_t *src_ptl,
+                         struct folio *src_folio,
+                         struct swap_info_struct *si, swp_entry_t entry)
 {
-        if (!pte_swp_exclusive(orig_src_pte))
-                return -EBUSY;
+        /*
+         * Check if the folio still belongs to the target swap entry after
+         * acquiring the lock. Folio can be freed in the swap cache while
+         * not locked.
+         */
+        if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+                                  entry.val != src_folio->swap.val))
+                return -EAGAIN;
 
         double_pt_lock(dst_ptl, src_ptl);
 
-        if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
-            !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+                                 dst_pmd, dst_pmdval)) {
                 double_pt_unlock(dst_ptl, src_ptl);
                 return -EAGAIN;
         }
 
+        /*
+         * The src_folio resides in the swapcache, requiring an update to its
+         * index and mapping to align with the dst_vma, where a swap-in may
+         * occur and hit the swapcache after moving the PTE.
+         */
+        if (src_folio) {
+                folio_move_anon_rmap(src_folio, dst_vma);
+                src_folio->index = linear_page_index(dst_vma, dst_addr);
+        } else {
+                /*
+                 * Check if the swap entry is cached after acquiring the src_pte
+                 * lock. Otherwise, we might miss a newly loaded swap cache folio.
+                 *
+                 * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+                 * We are trying to catch newly added swap cache, the only possible case is
+                 * when a folio is swapped in and out again staying in swap cache, using the
+                 * same entry before the PTE check above. The PTL is acquired and released
+                 * twice, each time after updating the swap_map's flag. So holding
+                 * the PTL here ensures we see the updated value. False positive is possible,
+                 * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+                 * cache, or during the tiny synchronization window between swap cache and
+                 * swap_map, but it will be gone very quickly, worst result is retry jitters.
+                 */
+                if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+                        double_pt_unlock(dst_ptl, src_ptl);
+                        return -EAGAIN;
+                }
+        }
+
         orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+        if (pgtable_supports_soft_dirty())
+                orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
         set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
         double_pt_unlock(dst_ptl, src_ptl);
 
-        return 0;
+        return PAGE_SIZE;
 }
 
 static int move_zeropage_pte(struct mm_struct *mm,
@@ -1100,13 +1221,14 @@ static int move_zeropage_pte(struct mm_struct *mm,
                              unsigned long dst_addr, unsigned long src_addr,
                              pte_t *dst_pte, pte_t *src_pte,
                              pte_t orig_dst_pte, pte_t orig_src_pte,
+                             pmd_t *dst_pmd, pmd_t dst_pmdval,
                              spinlock_t *dst_ptl, spinlock_t *src_ptl)
 {
         pte_t zero_pte;
 
         double_pt_lock(dst_ptl, src_ptl);
-        if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
-            !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
+        if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
+                                 dst_pmd, dst_pmdval)) {
                 double_pt_unlock(dst_ptl, src_ptl);
                 return -EAGAIN;
         }
@@ -1117,47 +1239,59 @@ static int move_zeropage_pte(struct mm_struct *mm,
         set_pte_at(mm, dst_addr, dst_pte, zero_pte);
         double_pt_unlock(dst_ptl, src_ptl);
 
-        return 0;
+        return PAGE_SIZE;
 }
 
 /*
- * The mmap_lock for reading is held by the caller. Just move the page
- * from src_pmd to dst_pmd if possible, and return true if succeeded
- * in moving the page.
+ * The mmap_lock for reading is held by the caller. Just move the page(s)
+ * from src_pmd to dst_pmd if possible, and return number of bytes moved.
+ * On failure, an error code is returned.
  */
-static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
-                          struct vm_area_struct *dst_vma,
-                          struct vm_area_struct *src_vma,
-                          unsigned long dst_addr, unsigned long src_addr,
-                          __u64 mode)
+static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
+                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma,
+                            unsigned long dst_addr, unsigned long src_addr,
+                            unsigned long len, __u64 mode)
 {
-        swp_entry_t entry;
+        struct swap_info_struct *si = NULL;
         pte_t orig_src_pte, orig_dst_pte;
         pte_t src_folio_pte;
         spinlock_t *src_ptl, *dst_ptl;
         pte_t *src_pte = NULL;
         pte_t *dst_pte = NULL;
-
+        pmd_t dummy_pmdval;
+        pmd_t dst_pmdval;
         struct folio *src_folio = NULL;
-        struct anon_vma *src_anon_vma = NULL;
         struct mmu_notifier_range range;
-        int err = 0;
+        long ret = 0;
 
-        flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
-                                src_addr, src_addr + PAGE_SIZE);
+                                src_addr, src_addr + len);
         mmu_notifier_invalidate_range_start(&range);
 retry:
-        dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl);
+        /*
+         * Use the maywrite version to indicate that dst_pte will be modified,
+         * since dst_pte needs to be none, the subsequent pte_same() check
+         * cannot prevent the dst_pte page from being freed concurrently, so we
+         * also need to obtain dst_pmdval and recheck pmd_same() later.
+         */
+        dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
+                                           &dst_ptl);
 
         /* Retry if a huge pmd materialized from under us */
         if (unlikely(!dst_pte)) {
-                err = -EAGAIN;
+                ret = -EAGAIN;
                 goto out;
         }
 
-        src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl);
+        /*
+         * Unlike dst_pte, the subsequent pte_same() check can ensure the
+         * stability of the src_pte page, so there is no need to get pmdval,
+         * just pass a dummy variable to it.
+         */
+        src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
                                            &src_ptl);
 
         /*
         * We held the mmap_lock for reading so MADV_DONTNEED
@@ -1166,14 +1300,14 @@ retry:
          * transparent huge pages under us.
          */
         if (unlikely(!src_pte)) {
-                err = -EAGAIN;
+                ret = -EAGAIN;
                 goto out;
         }
 
         /* Sanity checks before the operation */
-        if (WARN_ON_ONCE(pmd_none(*dst_pmd)) || WARN_ON_ONCE(pmd_none(*src_pmd)) ||
-            WARN_ON_ONCE(pmd_trans_huge(*dst_pmd)) || WARN_ON_ONCE(pmd_trans_huge(*src_pmd))) {
-                err = -EINVAL;
+        if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
+            pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
+                ret = -EINVAL;
                 goto out;
         }
@@ -1181,7 +1315,7 @@ retry:
         orig_dst_pte = ptep_get(dst_pte);
         spin_unlock(dst_ptl);
         if (!pte_none(orig_dst_pte)) {
-                err = -EEXIST;
+                ret = -EEXIST;
                 goto out;
         }
@@ -1190,34 +1324,35 @@ retry:
         spin_unlock(src_ptl);
         if (pte_none(orig_src_pte)) {
                 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
-                        err = -ENOENT;
+                        ret = -ENOENT;
                 else /* nothing to do to move a hole */
-                        err = 0;
+                        ret = PAGE_SIZE;
                 goto out;
         }
 
         /* If PTE changed after we locked the folio then start over */
         if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
-                err = -EAGAIN;
+                ret = -EAGAIN;
                 goto out;
         }
 
         if (pte_present(orig_src_pte)) {
                 if (is_zero_pfn(pte_pfn(orig_src_pte))) {
-                        err = move_zeropage_pte(mm, dst_vma, src_vma,
+                        ret = move_zeropage_pte(mm, dst_vma, src_vma,
                                                 dst_addr, src_addr, dst_pte, src_pte,
                                                 orig_dst_pte, orig_src_pte,
-                                                dst_ptl, src_ptl);
+                                                dst_pmd, dst_pmdval, dst_ptl, src_ptl);
                         goto out;
                 }
 
                 /*
-                 * Pin and lock both source folio and anon_vma. Since we are in
-                 * RCU read section, we can't block, so on contention have to
-                 * unmap the ptes, obtain the lock and retry.
+                 * Pin and lock source folio. Since we are in RCU read section,
+                 * we can't block, so on contention have to unmap the ptes,
+                 * obtain the lock and retry.
                  */
                 if (!src_folio) {
                         struct folio *folio;
+                        bool locked;
 
                         /*
                          * Pin the page while holding the lock to be sure the
@@ -1226,14 +1361,28 @@ retry:
                         spin_lock(src_ptl);
                         if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
                                 spin_unlock(src_ptl);
-                                err = -EAGAIN;
+                                ret = -EAGAIN;
                                 goto out;
                         }
 
                         folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
                         if (!folio || !PageAnonExclusive(&folio->page)) {
                                 spin_unlock(src_ptl);
-                                err = -EBUSY;
+                                ret = -EBUSY;
+                                goto out;
+                        }
+
+                        locked = folio_trylock(folio);
+                        /*
+                         * We avoid waiting for folio lock with a raised
+                         * refcount for large folios because extra refcounts
+                         * will result in split_folio() failing later and
+                         * retrying. If multiple tasks are trying to move a
+                         * large folio we can end up livelocking.
+                         */
+                        if (!locked && folio_test_large(folio)) {
+                                spin_unlock(src_ptl);
+                                ret = -EAGAIN;
                                 goto out;
                         }
@@ -1242,9 +1391,9 @@ retry:
                         src_folio_pte = orig_src_pte;
                         spin_unlock(src_ptl);
 
-                        if (!folio_trylock(src_folio)) {
-                                pte_unmap(&orig_src_pte);
-                                pte_unmap(&orig_dst_pte);
+                        if (!locked) {
+                                pte_unmap(src_pte);
+                                pte_unmap(dst_pte);
                                 src_pte = dst_pte = NULL;
                                 /* now we can block and wait */
                                 folio_lock(src_folio);
@@ -1252,7 +1401,7 @@ retry:
                         }
 
                         if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
-                                err = -EBUSY;
+                                ret = -EBUSY;
                                 goto out;
                         }
                 }
@@ -1260,11 +1409,11 @@ retry:
                 /* at this point we have src_folio locked */
                 if (folio_test_large(src_folio)) {
                         /* split_folio() can block */
-                        pte_unmap(&orig_src_pte);
-                        pte_unmap(&orig_dst_pte);
+                        pte_unmap(src_pte);
+                        pte_unmap(dst_pte);
                         src_pte = dst_pte = NULL;
-                        err = split_folio(src_folio);
-                        if (err)
+                        ret = split_folio(src_folio);
+                        if (ret)
                                 goto out;
                         /* have to reacquire the folio after it got split */
                         folio_unlock(src_folio);
@@ -1273,68 +1422,95 @@ retry:
                         goto retry;
                 }
 
-                if (!src_anon_vma) {
-                        /*
-                         * folio_referenced walks the anon_vma chain
-                         * without the folio lock. Serialize against it with
-                         * the anon_vma lock, the folio lock is not enough.
-                         */
-                        src_anon_vma = folio_get_anon_vma(src_folio);
-                        if (!src_anon_vma) {
-                                /* page was unmapped from under us */
-                                err = -EAGAIN;
+                ret = move_present_ptes(mm, dst_vma, src_vma,
+                                        dst_addr, src_addr, dst_pte, src_pte,
+                                        orig_dst_pte, orig_src_pte, dst_pmd,
+                                        dst_pmdval, dst_ptl, src_ptl, &src_folio,
+                                        len);
+        } else { /* !pte_present() */
+                struct folio *folio = NULL;
+                const softleaf_t entry = softleaf_from_pte(orig_src_pte);
+
+                if (softleaf_is_migration(entry)) {
+                        pte_unmap(src_pte);
+                        pte_unmap(dst_pte);
+                        src_pte = dst_pte = NULL;
+                        migration_entry_wait(mm, src_pmd, src_addr);
+
+                        ret = -EAGAIN;
+                        goto out;
+                } else if (!softleaf_is_swap(entry)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+
+                if (!pte_swp_exclusive(orig_src_pte)) {
+                        ret = -EBUSY;
+                        goto out;
+                }
+
+                si = get_swap_device(entry);
+                if (unlikely(!si)) {
+                        ret = -EAGAIN;
+                        goto out;
+                }
+                /*
+                 * Verify the existence of the swapcache. If present, the folio's
+                 * index and mapping must be updated even when the PTE is a swap
+                 * entry. The anon_vma lock is not taken during this process since
+                 * the folio has already been unmapped, and the swap entry is
+                 * exclusive, preventing rmap walks.
+                 *
+                 * For large folios, return -EBUSY immediately, as split_folio()
+                 * also returns -EBUSY when attempting to split unmapped large
+                 * folios in the swapcache. This issue needs to be resolved
+                 * separately to allow proper handling.
+                 */
+                if (!src_folio)
+                        folio = swap_cache_get_folio(entry);
+                if (folio) {
+                        if (folio_test_large(folio)) {
+                                ret = -EBUSY;
+                                folio_put(folio);
                                 goto out;
                         }
-                        if (!anon_vma_trylock_write(src_anon_vma)) {
-                                pte_unmap(&orig_src_pte);
-                                pte_unmap(&orig_dst_pte);
+                        src_folio = folio;
+                        src_folio_pte = orig_src_pte;
+                        if (!folio_trylock(src_folio)) {
+                                pte_unmap(src_pte);
+                                pte_unmap(dst_pte);
                                 src_pte = dst_pte = NULL;
+                                put_swap_device(si);
+                                si = NULL;
                                 /* now we can block and wait */
-                                anon_vma_lock_write(src_anon_vma);
+                                folio_lock(src_folio);
                                 goto retry;
                         }
                 }
-
-                err = move_present_pte(mm, dst_vma, src_vma,
-                                       dst_addr, src_addr, dst_pte, src_pte,
-                                       orig_dst_pte, orig_src_pte,
-                                       dst_ptl, src_ptl, src_folio);
-        } else {
-                entry = pte_to_swp_entry(orig_src_pte);
-                if (non_swap_entry(entry)) {
-                        if (is_migration_entry(entry)) {
-                                pte_unmap(&orig_src_pte);
-                                pte_unmap(&orig_dst_pte);
-                                src_pte = dst_pte = NULL;
-                                migration_entry_wait(mm, src_pmd, src_addr);
-                                err = -EAGAIN;
-                        } else
-                                err = -EFAULT;
-                        goto out;
-                }
-
-                err = move_swap_pte(mm, dst_addr, src_addr,
-                                    dst_pte, src_pte,
-                                    orig_dst_pte, orig_src_pte,
-                                    dst_ptl, src_ptl);
+                ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+                                    orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+                                    dst_ptl, src_ptl, src_folio, si, entry);
         }
 
 out:
-        if (src_anon_vma) {
-                anon_vma_unlock_write(src_anon_vma);
-                put_anon_vma(src_anon_vma);
-        }
         if (src_folio) {
                 folio_unlock(src_folio);
                 folio_put(src_folio);
         }
-        if (dst_pte)
-                pte_unmap(dst_pte);
+        /*
+         * Unmap in reverse order (LIFO) to maintain proper kmap_local
+         * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
+         * first, then src_pte, so we must unmap src_pte first, then dst_pte.
+         */
         if (src_pte)
                 pte_unmap(src_pte);
+        if (dst_pte)
+                pte_unmap(dst_pte);
         mmu_notifier_invalidate_range_end(&range);
+        if (si)
+                put_swap_device(si);
 
-        return err;
+        return ret;
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1376,7 +1552,7 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
 
         /*
          * For now, we keep it simple and only move between writable VMAs.
-         * Access flags are equal, therefore cheching only the source is enough.
+         * Access flags are equal, therefore checking only the source is enough.
          */
         if (!(src_vma->vm_flags & VM_WRITE))
                 return -EINVAL;
@@ -1469,16 +1645,24 @@ static int uffd_move_lock(struct mm_struct *mm,
         mmap_read_lock(mm);
         err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap);
-        if (!err) {
-                /*
-                 * See comment in uffd_lock_vma() as to why not using
-                 * vma_start_read() here.
-                 */
-                down_read(&(*dst_vmap)->vm_lock->lock);
-                if (*dst_vmap != *src_vmap)
-                        down_read_nested(&(*src_vmap)->vm_lock->lock,
-                                         SINGLE_DEPTH_NESTING);
+        if (err)
+                goto out;
+
+        if (!vma_start_read_locked(*dst_vmap)) {
+                err = -EAGAIN;
+                goto out;
+        }
+
+        /* Nothing further to do if both vmas are locked. */
+        if (*dst_vmap == *src_vmap)
+                goto out;
+
+        if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) {
+                /* Undo dst_vmap locking if src_vmap failed to lock */
+                vma_end_read(*dst_vmap);
+                err = -EAGAIN;
         }
+out:
         mmap_read_unlock(mm);
         return err;
 }
@@ -1582,36 +1766,25 @@ static void uffd_move_unlock(struct vm_area_struct *dst_vma,
  * virtual regions without knowing if there are transparent hugepage
  * in the regions or not, but preventing the risk of having to split
  * the hugepmd during the remap.
- *
- * If there's any rmap walk that is taking the anon_vma locks without
- * first obtaining the folio lock (the only current instance is
- * folio_referenced), they will have to verify if the folio->mapping
- * has changed after taking the anon_vma lock. If it changed they
- * should release the lock and retry obtaining a new anon_vma, because
- * it means the anon_vma was changed by move_pages() before the lock
- * could be obtained. This is the only additional complexity added to
- * the rmap code to provide this anonymous page remapping functionality.
  */
 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                    unsigned long src_start, unsigned long len, __u64 mode)
 {
         struct mm_struct *mm = ctx->mm;
         struct vm_area_struct *src_vma, *dst_vma;
-        unsigned long src_addr, dst_addr;
+        unsigned long src_addr, dst_addr, src_end;
         pmd_t *src_pmd, *dst_pmd;
         long err = -EINVAL;
         ssize_t moved = 0;
 
         /* Sanitize the command parameters. */
-        if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
-            WARN_ON_ONCE(dst_start & ~PAGE_MASK) ||
-            WARN_ON_ONCE(len & ~PAGE_MASK))
-                goto out;
+        VM_WARN_ON_ONCE(src_start & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK);
+        VM_WARN_ON_ONCE(len & ~PAGE_MASK);
 
         /* Does the address range wrap, or is the span zero-sized? */
-        if (WARN_ON_ONCE(src_start + len <= src_start) ||
-            WARN_ON_ONCE(dst_start + len <= dst_start))
-                goto out;
+        VM_WARN_ON_ONCE(src_start + len < src_start);
+        VM_WARN_ON_ONCE(dst_start + len < dst_start);
 
         err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma);
         if (err)
@@ -1642,8 +1815,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
         if (err)
                 goto out_unlock;
 
-        for (src_addr = src_start, dst_addr = dst_start;
-             src_addr < src_start + len;) {
+        for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len;
+             src_addr < src_end;) {
                 spinlock_t *ptl;
                 pmd_t dst_pmdval;
                 unsigned long step_size;
@@ -1685,22 +1858,19 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 
                 ptl = pmd_trans_huge_lock(src_pmd, src_vma);
                 if (ptl) {
-                        if (pmd_devmap(*src_pmd)) {
-                                spin_unlock(ptl);
-                                err = -ENOENT;
-                                break;
-                        }
-
                         /* Check if we can move the pmd without splitting it. */
                         if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
                             !pmd_none(dst_pmdval)) {
-                                struct folio *folio = pmd_folio(*src_pmd);
-
-                                if (!folio || (!is_huge_zero_folio(folio) &&
-                                               !PageAnonExclusive(&folio->page))) {
-                                        spin_unlock(ptl);
-                                        err = -EBUSY;
-                                        break;
+                                /* Can be a migration entry */
+                                if (pmd_present(*src_pmd)) {
+                                        struct folio *folio = pmd_folio(*src_pmd);
+
+                                        if (!is_huge_zero_folio(folio) &&
+                                            !PageAnonExclusive(&folio->page)) {
+                                                spin_unlock(ptl);
+                                                err = -EBUSY;
+                                                break;
+                                        }
                                 }
 
                                 spin_unlock(ptl);
@@ -1714,6 +1884,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                                           dst_addr, src_addr);
                         step_size = HPAGE_PMD_SIZE;
                 } else {
+                        long ret;
+
                         if (pmd_none(*src_pmd)) {
                                 if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
                                         err = -ENOENT;
@@ -1730,10 +1902,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
                                 break;
                         }
 
-                        err = move_pages_pte(mm, dst_pmd, src_pmd,
-                                             dst_vma, src_vma,
-                                             dst_addr, src_addr, mode);
-                        step_size = PAGE_SIZE;
+                        ret = move_pages_ptes(mm, dst_pmd, src_pmd,
                                              dst_vma, src_vma, dst_addr,
+                                              src_addr, src_end - src_addr, mode);
+                        if (ret < 0)
+                                err = ret;
+                        else
+                                step_size = ret;
                 }
 
                 cond_resched();
@@ -1761,8 +1936,185 @@ out_unlock:
         up_read(&ctx->map_changing_lock);
         uffd_move_unlock(dst_vma, src_vma);
 out:
-        VM_WARN_ON(moved < 0);
-        VM_WARN_ON(err > 0);
-        VM_WARN_ON(!moved && !err);
+        VM_WARN_ON_ONCE(moved < 0);
+        VM_WARN_ON_ONCE(err > 0);
+        VM_WARN_ON_ONCE(!moved && !err);
         return moved ? moved : err;
 }
+
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+                                     vm_flags_t vm_flags)
+{
+        const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
+
+        vm_flags_reset(vma, vm_flags);
+        /*
+         * For shared mappings, we want to enable writenotify while
+         * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+         * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+         */
+        if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+                vma_set_page_prot(vma);
+}
+
+static void userfaultfd_set_ctx(struct vm_area_struct *vma,
+                                struct userfaultfd_ctx *ctx,
+                                vm_flags_t vm_flags)
+{
+        vma_start_write(vma);
+        vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
+        userfaultfd_set_vm_flags(vma,
+                                 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
+}
+
+void userfaultfd_reset_ctx(struct vm_area_struct *vma)
+{
+        userfaultfd_set_ctx(vma, NULL, 0);
+}
+
+struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+                                             struct vm_area_struct *prev,
+                                             struct vm_area_struct *vma,
+                                             unsigned long start,
+                                             unsigned long end)
+{
+        struct vm_area_struct *ret;
+        bool give_up_on_oom = false;
+
+        /*
+         * If we are modifying only and not splitting, just give up on the merge
+         * if OOM prevents us from merging successfully.
+         */
+        if (start == vma->vm_start && end == vma->vm_end)
+                give_up_on_oom = true;
+
+        /* Reset ptes for the whole vma range if wr-protected */
+        if (userfaultfd_wp(vma))
+                uffd_wp_range(vma, start, end - start, false);
+
+        ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
+                                    vma->vm_flags & ~__VM_UFFD_FLAGS,
+                                    NULL_VM_UFFD_CTX, give_up_on_oom);
+
+        /*
+         * In the vma_merge() successful mprotect-like case 8:
+         * the next vma was merged into the current one and
+         * the current one has not been updated yet.
+         */
+        if (!IS_ERR(ret))
+                userfaultfd_reset_ctx(ret);
+
+        return ret;
+}
+
+/* Assumes mmap write lock taken, and mm_struct pinned. */
+int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+                               struct vm_area_struct *vma,
+                               vm_flags_t vm_flags,
+                               unsigned long start, unsigned long end,
+                               bool wp_async)
+{
+        VMA_ITERATOR(vmi, ctx->mm, start);
+        struct vm_area_struct *prev = vma_prev(&vmi);
+        unsigned long vma_end;
+        vm_flags_t new_flags;
+
+        if (vma->vm_start < start)
+                prev = vma;
+
+        for_each_vma_range(vmi, vma, end) {
+                cond_resched();
+
+                VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+                VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
+                                vma->vm_userfaultfd_ctx.ctx != ctx);
+                VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
+
+                /*
+                 * Nothing to do: this vma is already registered into this
+                 * userfaultfd and with the right tracking mode too.
+                 */
+                if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+                    (vma->vm_flags & vm_flags) == vm_flags)
+                        goto skip;
+
+                if (vma->vm_start > start)
+                        start = vma->vm_start;
+                vma_end = min(end, vma->vm_end);
+
+                new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
+                vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+                                            new_flags,
+                                            (struct vm_userfaultfd_ctx){ctx},
+                                            /* give_up_on_oom = */false);
+                if (IS_ERR(vma))
+                        return PTR_ERR(vma);
+
+                /*
+                 * In the vma_merge() successful mprotect-like case 8:
+                 * the next vma was merged into the current one and
+                 * the current one has not been updated yet.
+                 */
+                userfaultfd_set_ctx(vma, ctx, vm_flags);
+
+                if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+                        hugetlb_unshare_all_pmds(vma);
+
+skip:
+                prev = vma;
+                start = vma->vm_end;
+        }
+
+        return 0;
+}
+
+void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
+{
+        struct mm_struct *mm = ctx->mm;
+        struct vm_area_struct *vma;
+        VMA_ITERATOR(vmi, mm, 0);
+
+        /* the various vma->vm_userfaultfd_ctx still points to it */
+        mmap_write_lock(mm);
+        for_each_vma(vmi, vma) {
+                if (vma->vm_userfaultfd_ctx.ctx == ctx)
+                        userfaultfd_reset_ctx(vma);
+        }
+        mmap_write_unlock(mm);
+}
+
+void userfaultfd_release_all(struct mm_struct *mm,
+                             struct userfaultfd_ctx *ctx)
+{
+        struct vm_area_struct *vma, *prev;
+        VMA_ITERATOR(vmi, mm, 0);
+
+        if (!mmget_not_zero(mm))
+                return;
+
+        /*
+         * Flush page faults out of all CPUs. NOTE: all page faults
+         * must be retried without returning VM_FAULT_SIGBUS if
+         * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
+         * changes while handle_userfault released the mmap_lock. So
+         * it's critical that released is set to true (above), before
+         * taking the mmap_lock for writing.
+         */
+        mmap_write_lock(mm);
+        prev = NULL;
+        for_each_vma(vmi, vma) {
+                cond_resched();
+                VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
+                                !!(vma->vm_flags & __VM_UFFD_FLAGS));
+                if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+                        prev = vma;
+                        continue;
+                }
+
+                vma = userfaultfd_clear_vma(&vmi, prev, vma,
+                                            vma->vm_start, vma->vm_end);
+                prev = vma;
+        }
+        mmap_write_unlock(mm);
+        mmput(mm);
+}
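
For context, move_pages() above is the kernel side of the UFFDIO_MOVE ioctl. The following is a minimal userspace sketch of that flow, not part of the patch: it assumes a v6.8+ kernel exposing UFFDIO_MOVE and UFFD_FEATURE_MOVE in linux/userfaultfd.h, permission to create a userfaultfd (CAP_SYS_PTRACE or vm.unprivileged_userfaultfd=1), and it elides most error handling. A multi-page len like this is exactly the case the batched move_present_ptes() path above speeds up.

/* build with: gcc -pthread uffd_move_sketch.c */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define LEN (16 * 4096)	/* multi-page range within one PMD */

static int uffd;
static char *src, *dst;

static void *handler(void *arg)
{
	struct uffd_msg msg;

	/* Block until the main thread faults on the missing dst page. */
	read(uffd, &msg, sizeof(msg));

	/* Resolve the fault by moving src's anon pages into dst. */
	struct uffdio_move mv = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = LEN,
		.mode = 0,	/* or UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES */
	};
	if (ioctl(uffd, UFFDIO_MOVE, &mv))
		perror("UFFDIO_MOVE");	/* mv.move reports bytes handled */
	return NULL;
}

int main(void)
{
	pthread_t t;

	uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API, .features = UFFD_FEATURE_MOVE };
	ioctl(uffd, UFFDIO_API, &api);

	src = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	dst = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 'x', LEN);	/* populate src so its PTEs are present */

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = LEN },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	pthread_create(&t, NULL, handler, NULL);
	printf("dst[0] = %c\n", dst[0]);	/* faults; handler moves pages */
	pthread_join(t, NULL);
	return 0;
}

On success the pages are remapped rather than copied, and the kernel reports progress through mv.move, mirroring the "bytes moved or negative error" convention that move_pages_ptes() now returns per step.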

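The double_pt_lock() cleanup above (open-coded exchange replaced by swap()) keeps the usual deadlock-avoidance discipline for taking two page-table locks: order them by address, and take the second only if it is distinct. The same idiom can be sketched as a standalone pthread analogue, purely illustrative and independent of the kernel:

#include <pthread.h>
#include <stdio.h>

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a > b) {		/* same effect as swap(ptl1, ptl2) above */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);	/* always lower address first */
	if (a != b)		/* src and dst PTEs may share one lock */
		pthread_mutex_lock(b);
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);	/* caller argument order doesn't matter: */
	double_unlock(&m1, &m2);
	double_lock(&m2, &m1);	/* ...both orders acquire in the same sequence */
	double_unlock(&m2, &m1);
	puts("ok");
	return 0;
}

Because every path acquires the pair in one global order (by address), two tasks moving pages in opposite directions cannot each hold one lock while waiting on the other.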