summary refs log tree commit diff
path: root/mm/userfaultfd.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/userfaultfd.c')
-rw-r--r-- mm/userfaultfd.c 225
1 files changed, 125 insertions, 100 deletions
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d55..0e2132834bc7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -48,6 +48,78 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ *
+ * @newly_allocated: when true, the page is also added to the LRU via
+ * lru_cache_add_inactive_or_unevictable() before the PTE is written.
+ * @wp_copy: when true and the mapping ends up writable, the PTE is marked
+ * uffd-wp instead of being made writable.
+ *
+ * Returns 0 on success, -EFAULT if dst_vma is shmem and dst_addr's page
+ * offset is at or beyond the inode size, or -EEXIST if the PTE at dst_addr
+ * is not none. The PTE lock is taken and dropped internally on every path.
+ * The page reference is not dropped here on failure; the callers visible in
+ * this file release it themselves.
+ *
+ * NOTE(review): if wp_copy is set but the mapping is not writable (no
+ * VM_WRITE, or a page-cache page in a private VMA), no uffd-wp marker is
+ * applied - confirm this is intended.
+ */
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, bool wp_copy)
+{
+ int ret;
+ pte_t _dst_pte, *dst_pte;
+ bool writable = dst_vma->vm_flags & VM_WRITE;
+ bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+ bool page_in_cache = page->mapping; /* a non-NULL mapping is treated as "page cache page" */
+ spinlock_t *ptl;
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (page_in_cache && !vm_shared) /* private mapping of a page cache page: never map it writable */
+ writable = false;
+ if (writable || !page_in_cache) /* only a non-writable mapping of a page cache page stays clean */
+ _dst_pte = pte_mkdirty(_dst_pte);
+ if (writable) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ if (vma_is_shmem(dst_vma)) {
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); /* number of pages needed to cover i_size */
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST; /* refuse to overwrite an entry that is already populated */
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ if (page_in_cache)
+ page_add_file_rmap(page, false);
+ else
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+ /*
+ * Must happen after rmap, as mm_counter() checks mapping (via
+ * PageAnon()), which is set by __page_set_anon_rmap().
+ */
+ inc_mm_counter(dst_mm, mm_counter(page));
+
+ if (newly_allocated)
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl); /* shared exit: success and error paths both drop the PTE lock here */
+ return ret;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -56,13 +128,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- pte_t _dst_pte, *dst_pte;
- spinlock_t *ptl;
void *page_kaddr;
int ret;
struct page *page;
- pgoff_t offset, max_off;
- struct inode *inode;
if (!*pagep) {
ret = -ENOMEM;
@@ -99,43 +167,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
- _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
- if (dst_vma->vm_flags & VM_WRITE) {
- if (wp_copy)
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- else
- _dst_pte = pte_mkwrite(_dst_pte);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
- }
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
- pte_unmap_unlock(dst_pte, ptl);
- ret = 0;
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, wp_copy);
+ if (ret)
+ goto out_release;
out:
return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
out_release:
put_page(page);
goto out;
@@ -176,6 +213,41 @@ out_unlock:
return ret;
}
+/*
+ * Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private).
+ *
+ * Looks up the page at dst_addr's file offset with shmem_getpage(SGP_READ)
+ * and maps it at dst_addr via mfill_atomic_install_pte() with
+ * newly_allocated == false.
+ *
+ * Returns 0 on success, the shmem_getpage() error if the lookup fails,
+ * -EFAULT if the lookup yields no page, or the error from
+ * mfill_atomic_install_pte(). On an install failure the page is unlocked and
+ * its reference dropped; on success it is only unlocked.
+ *
+ * NOTE(review): the unlock_page() calls imply shmem_getpage() hands back a
+ * locked page with a reference held - confirm against shmem_getpage().
+ */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ bool wp_copy)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct page *page;
+ int ret;
+
+ ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+ if (ret)
+ goto out;
+ if (!page) { /* lookup reported success but produced no page for this offset */
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, false, wp_copy);
+ if (ret)
+ goto out_release;
+
+ unlock_page(page); /* success: the page stays referenced, only the lock is dropped */
+ ret = 0;
+out:
+ return ret;
+out_release:
+ unlock_page(page);
+ put_page(page); /* install failed: give back the reference obtained by the lookup */
+ goto out;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -209,7 +281,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long len,
enum mcopy_atomic_mode mode)
{
- int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -308,7 +379,6 @@ retry:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
cond_resched();
@@ -346,54 +416,8 @@ retry:
out_unlock:
mmap_read_unlock(dst_mm);
out:
- if (page) {
- /*
- * We encountered an error and are about to free a newly
- * allocated huge page.
- *
- * Reservation handling is very subtle, and is different for
- * private and shared mappings. See the routine
- * restore_reserve_on_error for details. Unfortunately, we
- * can not call restore_reserve_on_error now as it would
- * require holding mmap_lock.
- *
- * If a reservation for the page existed in the reservation
- * map of a private mapping, the map was modified to indicate
- * the reservation was consumed when the page was allocated.
- * We clear the HPageRestoreReserve flag now so that the global
- * reserve count will not be incremented in free_huge_page.
- * The reservation map will still indicate the reservation
- * was consumed and possibly prevent later page allocation.
- * This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear
- * HPageRestoreReserve as no adjustments to reservation counts
- * were made during allocation.
- *
- * The reservation map for shared mappings indicates which
- * pages have reservations. When a huge page is allocated
- * for an address with a reservation, no change is made to
- * the reserve map. In this case HPageRestoreReserve will be
- * set to indicate that the global reservation count should be
- * incremented when the page is freed. This is the desired
- * behavior. However, when a huge page is allocated for an
- * address without a reservation a reservation entry is added
- * to the reservation map, and HPageRestoreReserve will not be
- * set. When the page is freed, the global reserve count will
- * NOT be incremented and it will appear as though we have
- * leaked reserved page. In this case, set HPageRestoreReserve
- * so that the global reserve count will be incremented to
- * match the reservation map entry which was created.
- *
- * Note that vm_alloc_shared is based on the flags of the vma
- * for which the page was originally allocated. dst_vma could
- * be different or NULL on error.
- */
- if (vm_alloc_shared)
- SetHPageRestoreReserve(page);
- else
- ClearHPageRestoreReserve(page);
+ if (page)
put_page(page);
- }
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
@@ -415,11 +439,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage,
+ enum mcopy_atomic_mode mode,
bool wp_copy)
{
ssize_t err;
+ if (mode == MCOPY_ATOMIC_CONTINUE) {
+ return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ wp_copy);
+ }
+
/*
* The normal page fault path for a shmem will invoke the
* fault, fill the hole in the file and COW it right away. The
@@ -431,7 +460,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
+ if (mode == MCOPY_ATOMIC_NORMAL)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page,
wp_copy);
@@ -440,13 +469,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
dst_vma, dst_addr);
} else {
VM_WARN_ON_ONCE(wp_copy);
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ mode != MCOPY_ATOMIC_NORMAL,
+ page);
}
return err;
@@ -467,7 +493,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
- bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/*
* Sanitize the command parameters:
@@ -530,7 +555,7 @@ retry:
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
goto out_unlock;
/*
@@ -578,7 +603,7 @@ retry:
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ src_addr, &page, mcopy_mode, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {