Diffstat (limited to 'mm/gup.c')
 -rw-r--r--  mm/gup.c | 1303
 1 file changed, 591 insertions(+), 712 deletions(-)
@@ -5,6 +5,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memfd.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> @@ -17,6 +18,7 @@ #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> +#include <linux/pagevec.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> @@ -24,11 +26,7 @@ #include <asm/tlbflush.h> #include "internal.h" - -struct follow_page_context { - struct dev_pagemap *pgmap; - unsigned int page_mask; -}; +#include "swap.h" static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) @@ -50,17 +48,22 @@ static inline void sanity_check_pinned_pages(struct page **pages, */ for (; npages; npages--, pages++) { struct page *page = *pages; - struct folio *folio = page_folio(page); + struct folio *folio; + + if (!page) + continue; + + folio = page_folio(page); if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); + VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio); else /* Either a PTE-mapped or a PMD-mapped THP. */ - VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) && + !PageAnonExclusive(page), page); } } @@ -89,8 +92,7 @@ retry: * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); goto retry; } @@ -103,14 +105,13 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); } /** @@ -142,7 +143,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) @@ -159,7 +160,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, * Increment the normal page refcount field at least once, * so that the page really is pinned. */ - if (folio_test_large(folio)) { + if (folio_has_pincount(folio)) { folio_ref_add(folio, refs); atomic_add(refs, &folio->_pincount); } else { @@ -189,6 +190,19 @@ void unpin_user_page(struct page *page) EXPORT_SYMBOL(unpin_user_page); /** + * unpin_folio() - release a dma-pinned folio + * @folio: pointer to folio to be released + * + * Folios that were pinned via memfd_pin_folios() or other similar routines + * must be released either using unpin_folio() or unpin_folios(). + */ +void unpin_folio(struct folio *folio) +{ + gup_put_folio(folio, 1, FOLL_PIN); +} +EXPORT_SYMBOL_GPL(unpin_folio); + +/** * folio_add_pin - Try to get an additional pin on a pinned folio * @folio: The folio to be pinned * @@ -205,7 +219,7 @@ void folio_add_pin(struct folio *folio) * page refcount field at least once, so that the page really is * pinned. 
*/ - if (folio_test_large(folio)) { + if (folio_has_pincount(folio)) { WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); folio_ref_inc(folio); atomic_inc(&folio->_pincount); @@ -218,7 +232,7 @@ void folio_add_pin(struct folio *folio) static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next = nth_page(start, i); + struct page *next = start + i; struct folio *folio = page_folio(next); unsigned int nr = 1; @@ -290,7 +304,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called - * page_mkclean(), followed by set_page_dirty(). + * folio_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes @@ -323,6 +337,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * + * The page range must be truly physically contiguous: the page range + * corresponds to a contiguous PFN range and all pages can be iterated + * naturally. + * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. @@ -340,6 +358,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, struct folio *folio; unsigned int nr; + VM_WARN_ON_ONCE(!page_range_contiguous(page, npages)); + for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { @@ -394,40 +414,77 @@ void unpin_user_pages(struct page **pages, unsigned long npages) sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { + if (!pages[i]) { + nr = 1; + continue; + } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages); +/** + * unpin_user_folio() - release pages of a folio + * @folio: pointer to folio to be released + * @npages: number of pages of same folio + * + * Release npages of the folio + */ +void unpin_user_folio(struct folio *folio, unsigned long npages) +{ + gup_put_folio(folio, npages, FOLL_PIN); +} +EXPORT_SYMBOL(unpin_user_folio); + +/** + * unpin_folios() - release an array of gup-pinned folios. + * @folios: array of folios to be marked dirty and released. + * @nfolios: number of folios in the @folios array. + * + * For each folio in the @folios array, release the folio using gup_put_folio. + * + * Please see the unpin_folio() documentation for details. + */ +void unpin_folios(struct folio **folios, unsigned long nfolios) +{ + unsigned long i = 0, j; + + /* + * If this WARN_ON() fires, then the system *might* be leaking folios + * (by leaving them pinned), but probably not. More likely, gup/pup + * returned a hard -ERRNO error to the caller, who erroneously passed + * it here. 
+ */ + if (WARN_ON(IS_ERR_VALUE(nfolios))) + return; + + while (i < nfolios) { + for (j = i + 1; j < nfolios; j++) + if (folios[i] != folios[j]) + break; + + if (folios[i]) + gup_put_folio(folios[i], j - i, FOLL_PIN); + i = j; + } +} +EXPORT_SYMBOL_GPL(unpin_folios); + /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU -#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST) -static int record_subpages(struct page *page, unsigned long sz, - unsigned long addr, unsigned long end, - struct page **pages) -{ - struct page *start_page; - int nr; - - start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); - for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(start_page, nr); - - return nr; -} - +#ifdef CONFIG_HAVE_GUP_FAST /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed @@ -494,8 +551,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { - if (!put_devmap_managed_folio_refs(folio, refs)) - folio_put_refs(folio, refs); + folio_put_refs(folio, refs); return NULL; } @@ -507,7 +563,7 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ - if (folio_test_large(folio)) + if (folio_has_pincount(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, @@ -523,154 +579,34 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, return folio; } -#endif /* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */ - -#ifdef CONFIG_ARCH_HAS_HUGEPD -static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, - unsigned long sz) -{ - unsigned long __boundary = (addr + sz) & ~(sz-1); - return (__boundary - 1 < end - 1) ? __boundary : end; -} - -/* - * Returns 1 if succeeded, 0 if failed, -EMLINK if unshare needed. - * - * NOTE: for the same entry, gup-fast and gup-slow can return different - * results (0 v.s. -EMLINK) depending on whether vma is available. This is - * the expected behavior, where we simply want gup-fast to fallback to - * gup-slow to take the vma reference first. 
- */ -static int gup_hugepte(struct vm_area_struct *vma, pte_t *ptep, unsigned long sz, - unsigned long addr, unsigned long end, unsigned int flags, - struct page **pages, int *nr, bool fast) -{ - unsigned long pte_end; - struct page *page; - struct folio *folio; - pte_t pte; - int refs; - - pte_end = (addr + sz) & ~(sz-1); - if (pte_end < end) - end = pte_end; - - pte = huge_ptep_get(ptep); - - if (!pte_access_permitted(pte, flags & FOLL_WRITE)) - return 0; - - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - page = pte_page(pte); - refs = record_subpages(page, sz, addr, end, pages + *nr); - - if (fast) { - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - } else { - folio = page_folio(page); - if (try_grab_folio(folio, refs, flags)) - return 0; - } - - if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pte_write(pte) && gup_must_unshare(vma, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return -EMLINK; - } - - *nr += refs; - folio_set_referenced(folio); - return 1; -} +#endif /* CONFIG_HAVE_GUP_FAST */ -/* - * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file - * systems on Power, which does not have issue with folio writeback against - * GUP updates. When hugepd will be extended to support non-hugetlbfs or - * even anonymous memory, we need to do extra check as what we do with most - * of the other folios. See writable_file_mapping_allowed() and - * gup_fast_folio_allowed() for more information. - */ -static int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, - unsigned long addr, unsigned int pdshift, - unsigned long end, unsigned int flags, - struct page **pages, int *nr, bool fast) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - int ret; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - ret = gup_hugepte(vma, ptep, sz, addr, end, flags, pages, nr, - fast); - if (ret != 1) - return ret; - } while (ptep++, addr = next, addr != end); - - return 1; -} - -static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, - unsigned long addr, unsigned int pdshift, - unsigned int flags, - struct follow_page_context *ctx) +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) { - struct page *page; - struct hstate *h; - spinlock_t *ptl; - int nr = 0, ret; - pte_t *ptep; - - /* Only hugetlb supports hugepd */ - if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma))) - return ERR_PTR(-EFAULT); + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; - h = hstate_vma(vma); - ptep = hugepte_offset(hugepd, addr, pdshift); - ptl = huge_pte_lock(h, vma->vm_mm, ptep); - ret = gup_hugepd(vma, hugepd, addr, pdshift, addr + PAGE_SIZE, - flags, &page, &nr, false); - spin_unlock(ptl); + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; - if (ret == 1) { - /* GUP succeeded */ - WARN_ON_ONCE(nr != 1); - ctx->page_mask = (1U << huge_page_order(h)) - 1; - return page; - } + /* ... 
or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; - /* ret can be either 0 (translates to NULL) or negative */ - return ERR_PTR(ret); -} -#else /* CONFIG_ARCH_HAS_HUGEPD */ -static inline int gup_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, - unsigned long addr, unsigned int pdshift, - unsigned long end, unsigned int flags, - struct page **pages, int *nr, bool fast) -{ - return 0; -} + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; -static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd, - unsigned long addr, unsigned int pdshift, - unsigned int flags, - struct follow_page_context *ctx) -{ - return NULL; + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); } -#endif /* CONFIG_ARCH_HAS_HUGEPD */ - static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) @@ -698,9 +634,21 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; struct page *page; @@ -710,45 +658,24 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; - - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - pud_devmap(pud)) { - /* - * device mapped pages can only be returned if the caller - * will manage the page reference count. - * - * At least one of FOLL_GET | FOLL_PIN must be set, so - * assert that here: - */ - if (!(flags & (FOLL_GET | FOLL_PIN))) - return ERR_PTR(-EEXIST); - - if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pudp, flags & FOLL_WRITE); - - ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); - if (!ctx->pgmap) - return ERR_PTR(-EFAULT); - } - page = pfn_to_page(pfn); - if (!pud_devmap(pud) && !pud_write(pud) && - gup_must_unshare(vma, flags, page)) + if (!pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); else - ctx->page_mask = HPAGE_PUD_NR - 1; + *page_mask = HPAGE_PUD_NR - 1; return page; } @@ -762,31 +689,11 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... 
or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ - if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; return !userfaultfd_huge_pmd_wp(vma, pmd); } @@ -794,7 +701,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; @@ -818,8 +725,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) @@ -831,7 +738,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - ctx->page_mask = HPAGE_PMD_NR - 1; + *page_mask = HPAGE_PMD_NR - 1; return page; } @@ -839,7 +746,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { return NULL; } @@ -847,7 +754,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { return NULL; } @@ -883,50 +790,25 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. 
*/ - if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + if (pte_needs_soft_dirty_wp(vma, pte)) return false; return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct dev_pagemap **pgmap) + unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == - (FOLL_PIN | FOLL_GET))) - return ERR_PTR(-EINVAL); - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); @@ -939,8 +821,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, page = vm_normal_page(vma, address, pte); /* - * We only care about anon pages in can_follow_write_pte() and don't - * have to worry about pte_devmap() because they are never anon. + * We only care about anon pages in can_follow_write_pte(). */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { @@ -948,18 +829,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } - if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { - /* - * Only return device mapping pages in the FOLL_GET or FOLL_PIN - * case since they are only valid while holding the pgmap - * reference. - */ - *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); - if (*pgmap) - page = pte_page(pte); - else - goto no_page; - } else if (unlikely(!page)) { + if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -974,17 +844,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, goto out; } } + folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); + VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ - ret = try_grab_folio(page_folio(page), 1, flags); + ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; @@ -996,7 +867,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); + ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); @@ -1005,14 +876,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); + !pte_dirty(pte) && !folio_test_dirty(folio)) + folio_mark_dirty(folio); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). + * folio_mark_accessed(). 
*/ - mark_page_accessed(page); + folio_mark_accessed(folio); } out: pte_unmap_unlock(ptep, ptl); @@ -1027,7 +898,7 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -1040,19 +911,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); - if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval))))) - return follow_hugepd(vma, __hugepd(pmd_val(pmdval)), - address, PMD_SHIFT, flags, ctx); - if (pmd_devmap(pmdval)) { - ptl = pmd_lock(mm, pmd); - page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); - spin_unlock(ptl); - if (page) - return page; - return no_page_table(vma, flags, address); - } if (likely(!pmd_leaf(pmdval))) - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); @@ -1065,16 +925,16 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : - follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + follow_page_pte(vma, address, pmd, flags); } - page = follow_huge_pmd(vma, address, pmd, flags, ctx); + page = follow_huge_pmd(vma, address, pmd, flags, page_mask); spin_unlock(ptl); return page; } @@ -1082,7 +942,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pud_t *pudp, pud; spinlock_t *ptl; @@ -1090,15 +950,12 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); - if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) - return follow_hugepd(vma, __hugepd(pud_val(pud)), - address, PUD_SHIFT, flags, ctx); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); - page = follow_huge_pud(vma, address, pudp, flags, ctx); + page = follow_huge_pud(vma, address, pudp, flags, page_mask); spin_unlock(ptl); if (page) return page; @@ -1107,28 +964,24 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); - return follow_pmd_mask(vma, address, pudp, flags, ctx); + return follow_pmd_mask(vma, address, pudp, flags, page_mask); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); - if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) - return follow_hugepd(vma, __hugepd(p4d_val(p4d)), - address, 
P4D_SHIFT, flags, ctx); - if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); - return follow_pud_mask(vma, address, p4dp, flags, ctx); + return follow_pud_mask(vma, address, p4dp, flags, page_mask); } /** @@ -1136,20 +989,16 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour - * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a - * pointer to output page_mask + * @page_mask: a pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * - * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches - * the device's dev_pagemap metadata to avoid repeating expensive lookups. - * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * - * On output, the @ctx->page_mask is set according to the size of the page. + * On output, @page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented @@ -1157,7 +1006,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; @@ -1165,44 +1014,19 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, vma_pgtable_walk_begin(vma); - ctx->page_mask = 0; + *page_mask = 0; pgd = pgd_offset(mm, address); - if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd))))) - page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)), - address, PGDIR_SHIFT, flags, ctx); - else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) page = no_page_table(vma, flags, address); else - page = follow_p4d_mask(vma, address, pgd, flags, ctx); + page = follow_p4d_mask(vma, address, pgd, flags, page_mask); vma_pgtable_walk_end(vma); return page; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags) -{ - struct follow_page_context ctx = { NULL }; - struct page *page; - - if (vma_is_secretmem(vma)) - return NULL; - - if (WARN_ON_ONCE(foll_flags & FOLL_PIN)) - return NULL; - - /* - * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect - * to fail on PROT_NONE-mapped pages. - */ - page = follow_page_mask(vma, address, foll_flags, &ctx); - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); - return page; -} - static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -1218,10 +1042,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; - if (address > TASK_SIZE) - pgd = pgd_offset_k(address); - else - pgd = pgd_offset_gate(mm, address); + pgd = pgd_offset(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); @@ -1264,19 +1085,19 @@ unmap: * to 0 and -EBUSY returned. 
*/ static int faultin_page(struct vm_area_struct *vma, - unsigned long address, unsigned int *flags, bool unshare, + unsigned long address, unsigned int flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; - if (*flags & FOLL_NOFAULT) + if (flags & FOLL_NOFAULT) return -EFAULT; - if (*flags & FOLL_WRITE) + if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; - if (*flags & FOLL_REMOTE) + if (flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (*flags & FOLL_UNLOCKABLE) { + if (flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set @@ -1284,12 +1105,12 @@ static int faultin_page(struct vm_area_struct *vma, * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ - if (*flags & FOLL_INTERRUPTIBLE) + if (flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } - if (*flags & FOLL_NOWAIT) + if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; - if (*flags & FOLL_TRIED) { + if (flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist @@ -1299,7 +1120,7 @@ static int faultin_page(struct vm_area_struct *vma, if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ - VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); @@ -1323,7 +1144,7 @@ static int faultin_page(struct vm_area_struct *vma, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, *flags); + int err = vm_fault_to_errno(ret, flags); if (err) return err; @@ -1392,6 +1213,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma)) + return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) return -EFAULT; @@ -1403,9 +1227,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -1537,18 +1358,21 @@ static long __get_user_pages(struct mm_struct *mm, { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; - struct follow_page_context ctx = { NULL }; + unsigned long page_mask = 0; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); - VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + VM_WARN_ON_ONCE(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + VM_WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET)); do { struct page *page; - unsigned int foll_flags = gup_flags; unsigned int page_increm; /* first iteration or cross vma bound */ @@ -1576,7 +1400,7 @@ static long __get_user_pages(struct mm_struct *mm, pages ? 
&page : NULL); if (ret) goto out; - ctx.page_mask = 0; + page_mask = 0; goto next_page; } @@ -1599,9 +1423,9 @@ retry: } cond_resched(); - page = follow_page_mask(vma, start, foll_flags, &ctx); + page = follow_page_mask(vma, start, gup_flags, &page_mask); if (!page || PTR_ERR(page) == -EMLINK) { - ret = faultin_page(vma, start, &foll_flags, + ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: @@ -1632,7 +1456,7 @@ retry: goto out; } next_page: - page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1658,20 +1482,19 @@ next_page: * large folio, this should never fail. */ if (try_grab_folio(folio, page_increm - 1, - foll_flags)) { + gup_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ - gup_put_folio(folio, 1, - foll_flags); + gup_put_folio(folio, 1, gup_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { - subpage = nth_page(page, j); + subpage = page + j; pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); @@ -1683,8 +1506,6 @@ next_page: nr_pages -= page_increm; } while (nr_pages); out: - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); return i ? i : ret; } @@ -1852,7 +1673,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior @@ -1877,10 +1698,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ - if (!*locked) { - BUG_ON(ret < 0); - BUG_ON(ret >= nr_pages); - } + VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages)); if (ret > 0) { nr_pages -= ret; @@ -1925,7 +1743,6 @@ retry: ret = mmap_read_lock_killable(mm); if (ret) { - BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; @@ -1936,11 +1753,11 @@ retry: pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ - BUG_ON(ret != 0); + VM_WARN_ON_ONCE(ret != 0); goto retry; } if (ret != 1) { - BUG_ON(ret > 1); + VM_WARN_ON_ONCE(ret > 1); if (!pages_done) pages_done = ret; break; @@ -2002,10 +1819,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma); + VM_WARN_ON_ONCE_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* @@ -2074,8 +1891,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start, int gup_flags; long ret; - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(start)); + VM_WARN_ON_ONCE(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* @@ -2165,7 +1982,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; bool must_unlock = false; - unsigned long vm_flags; + vm_flags_t vm_flags; long i; if (!nr_pages) @@ -2228,28 +2045,22 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, */ size_t fault_in_writeable(char __user *uaddr, size_t size) { - char __user *start = uaddr, *end; 
+ const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_put_user(0, uaddr, out); - uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_put_user(0, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_put_user(0, (char __user *)cur, out); out: user_write_access_end(); - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); @@ -2303,26 +2114,24 @@ EXPORT_SYMBOL(fault_in_subpage_writeable); */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { - unsigned long start = (unsigned long)uaddr, end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; - end = PAGE_ALIGN(start + size); - if (end < start) - end = 0; mmap_read_lock(mm); - do { - if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + if (fixup_user_fault(mm, cur, FAULT_FLAG_WRITE, &unlocked)) break; - start = (start + PAGE_SIZE) & PAGE_MASK; - } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); @@ -2337,30 +2146,24 @@ EXPORT_SYMBOL(fault_in_safe_writeable); */ size_t fault_in_readable(const char __user *uaddr, size_t size) { - const char __user *start = uaddr, *end; + const unsigned long start = (unsigned long)uaddr; + const unsigned long end = start + size; + unsigned long cur; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; - if (!PAGE_ALIGNED(uaddr)) { - unsafe_get_user(c, uaddr, out); - uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); - } - end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); - if (unlikely(end < start)) - end = NULL; - while (uaddr != end) { - unsafe_get_user(c, uaddr, out); - uaddr += PAGE_SIZE; - } + /* Stop once we overflow to 0. */ + for (cur = start; cur && cur < end; cur = PAGE_ALIGN_DOWN(cur + PAGE_SIZE)) + unsafe_get_user(c, (const char __user *)cur, out); out: user_read_access_end(); (void)c; - if (size > uaddr - start) - return size - (uaddr - start); + if (size > cur - start) + return size - (cur - start); return 0; } EXPORT_SYMBOL(fault_in_readable); @@ -2368,6 +2171,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2380,37 +2184,95 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). 
*/ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION + /* - * Returns the number of collected pages. Return value is always >= 0. + * An array of either pages or folios ("pofs"). Although it may seem tempting to + * avoid this complication, by simply interpreting a list of folios as a list of + * pages, that approach won't work in the longer term, because eventually the + * layouts of struct page and struct folio will become completely different. + * Furthermore, this pof approach avoids excessive page_folio() calls. */ -static unsigned long collect_longterm_unpinnable_pages( - struct list_head *movable_page_list, - unsigned long nr_pages, - struct page **pages) +struct pages_or_folios { + union { + struct page **pages; + struct folio **folios; + void **entries; + }; + bool has_folios; + long nr_entries; +}; + +static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i) { - unsigned long i, collected = 0; - struct folio *prev_folio = NULL; - bool drain_allow = true; + if (pofs->has_folios) + return pofs->folios[i]; + return page_folio(pofs->pages[i]); +} - for (i = 0; i < nr_pages; i++) { - struct folio *folio = page_folio(pages[i]); +static void pofs_clear_entry(struct pages_or_folios *pofs, long i) +{ + pofs->entries[i] = NULL; +} - if (folio == prev_folio) - continue; - prev_folio = folio; +static void pofs_unpin(struct pages_or_folios *pofs) +{ + if (pofs->has_folios) + unpin_folios(pofs->folios, pofs->nr_entries); + else + unpin_user_pages(pofs->pages, pofs->nr_entries); +} + +static struct folio *pofs_next_folio(struct folio *folio, + struct pages_or_folios *pofs, long *index_ptr) +{ + long i = *index_ptr + 1; + + if (!pofs->has_folios && folio_test_large(folio)) { + const unsigned long start_pfn = folio_pfn(folio); + const unsigned long end_pfn = start_pfn + folio_nr_pages(folio); + + for (; i < pofs->nr_entries; i++) { + unsigned long pfn = page_to_pfn(pofs->pages[i]); + + /* Is this page part of this folio? */ + if (pfn < start_pfn || pfn >= end_pfn) + break; + } + } + + if (unlikely(i == pofs->nr_entries)) + return NULL; + *index_ptr = i; + + return pofs_get_folio(pofs, i); +} + +/* + * Returns the number of collected folios. Return value is always >= 0. 
+ */ +static unsigned long collect_longterm_unpinnable_folios( + struct list_head *movable_folio_list, + struct pages_or_folios *pofs) +{ + unsigned long collected = 0; + struct folio *folio; + int drained = 0; + long i = 0; + + for (folio = pofs_get_folio(pofs, i); folio; + folio = pofs_next_folio(folio, pofs, &i)) { if (folio_is_longterm_pinnable(folio)) continue; @@ -2421,19 +2283,27 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_page_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drained == 0 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && folio_may_be_lru_cached(folio) && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) continue; - list_add_tail(&folio->lru, movable_page_list); + list_add_tail(&folio->lru, movable_folio_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); @@ -2443,31 +2313,31 @@ static unsigned long collect_longterm_unpinnable_pages( } /* - * Unpins all pages and migrates device coherent pages and movable_page_list. - * Returns -EAGAIN if all pages were successfully migrated or -errno for failure - * (or partial success). + * Unpins all folios and migrates device coherent folios and movable_folio_list. + * Returns -EAGAIN if all folios were successfully migrated or -errno for + * failure (or partial success). */ -static int migrate_longterm_unpinnable_pages( - struct list_head *movable_page_list, - unsigned long nr_pages, - struct page **pages) +static int +migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list, + struct pages_or_folios *pofs) { int ret; unsigned long i; - for (i = 0; i < nr_pages; i++) { - struct folio *folio = page_folio(pages[i]); + for (i = 0; i < pofs->nr_entries; i++) { + struct folio *folio = pofs_get_folio(pofs, i); if (folio_is_device_coherent(folio)) { /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. + * Migration will fail if the folio is pinned, so + * convert the pin on the source folio to a normal + * reference. */ - pages[i] = NULL; + pofs_clear_entry(pofs, i); folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); - if (migrate_device_coherent_page(&folio->page)) { + if (migrate_device_coherent_folio(folio)) { ret = -EBUSY; goto err; } @@ -2476,24 +2346,24 @@ static int migrate_longterm_unpinnable_pages( } /* - * We can't migrate pages with unexpected references, so drop + * We can't migrate folios with unexpected references, so drop * the reference obtained by __get_user_pages_locked(). - * Migrating pages have been added to movable_page_list after + * Migrating folios have been added to movable_folio_list after * calling folio_isolate_lru() which takes a reference so the - * page won't be freed if it's migrating. + * folio won't be freed if it's migrating. 
*/ - unpin_user_page(pages[i]); - pages[i] = NULL; + unpin_folio(folio); + pofs_clear_entry(pofs, i); } - if (!list_empty(movable_page_list)) { + if (!list_empty(movable_folio_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, .reason = MR_LONGTERM_PIN, }; - if (migrate_pages(movable_page_list, alloc_migration_target, + if (migrate_pages(movable_folio_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; @@ -2501,48 +2371,78 @@ static int migrate_longterm_unpinnable_pages( } } - putback_movable_pages(movable_page_list); + putback_movable_pages(movable_folio_list); return -EAGAIN; err: - for (i = 0; i < nr_pages; i++) - if (pages[i]) - unpin_user_page(pages[i]); - putback_movable_pages(movable_page_list); + pofs_unpin(pofs); + putback_movable_pages(movable_folio_list); return ret; } +static long +check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) +{ + LIST_HEAD(movable_folio_list); + unsigned long collected; + + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); +} + /* - * Check whether all pages are *allowed* to be pinned. Rather confusingly, all - * pages in the range are required to be pinned via FOLL_PIN, before calling - * this routine. + * Check whether all folios are *allowed* to be pinned indefinitely (long term). + * Rather confusingly, all folios in the range are required to be pinned via + * FOLL_PIN, before calling this routine. + * + * Return values: * - * If any pages in the range are not allowed to be pinned, then this routine - * will migrate those pages away, unpin all the pages in the range and return - * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then - * call this routine again. + * 0: if everything is OK and all folios in the range are allowed to be pinned, + * then this routine leaves all folios pinned and returns zero for success. * - * If an error other than -EAGAIN occurs, this indicates a migration failure. - * The caller should give up, and propagate the error back up the call stack. + * -EAGAIN: if any folios in the range are not allowed to be pinned, then this + * routine will migrate those folios away, unpin all the folios in the range. If + * migration of the entire set of folios succeeds, then -EAGAIN is returned. The + * caller should re-pin the entire range with FOLL_PIN and then call this + * routine again. * - * If everything is OK and all pages in the range are allowed to be pinned, then - * this routine leaves all pages pinned and returns zero for success. + * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this + * indicates a migration failure. The caller should give up, and propagate the + * error back up the call stack. The caller does not need to unpin any folios in + * that case, because this routine will do the unpinning. + */ +static long check_and_migrate_movable_folios(unsigned long nr_folios, + struct folio **folios) +{ + struct pages_or_folios pofs = { + .folios = folios, + .has_folios = true, + .nr_entries = nr_folios, + }; + + return check_and_migrate_movable_pages_or_folios(&pofs); +} + +/* + * Return values and behavior are the same as those for + * check_and_migrate_movable_folios(). 
*/ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { - unsigned long collected; - LIST_HEAD(movable_page_list); + struct pages_or_folios pofs = { + .pages = pages, + .has_folios = false, + .nr_entries = nr_pages, + }; - collected = collect_longterm_unpinnable_pages(&movable_page_list, - nr_pages, pages); - if (!collected) - return 0; - - return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, - pages); + return check_and_migrate_movable_pages_or_folios(&pofs); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, @@ -2550,6 +2450,12 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, { return 0; } + +static long check_and_migrate_movable_folios(unsigned long nr_folios, + struct folio **folios) +{ + return 0; +} #endif /* CONFIG_MIGRATION */ /* @@ -2600,7 +2506,7 @@ static bool is_valid_gup_args(struct page **pages, int *locked, * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only - * - FOLL_REMOTE is internal only and used on follow_page() + * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) @@ -2804,7 +2710,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * *) ptes can be read atomically by the architecture. * - * *) access_ok is sufficient to validate userspace address ranges. + * *) valid user addresses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * @@ -2886,9 +2792,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return false; /* Anonymous folios pose no problem. */ - mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS; if (mapping_flags) - return mapping_flags & PAGE_MAPPING_ANON; + return mapping_flags & FOLIO_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an @@ -2935,8 +2841,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; + int ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); @@ -2960,19 +2865,11 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; - if (pte_devmap(pte)) { - if (unlikely(flags & FOLL_LONGTERM)) - goto pte_unmap; - - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - goto pte_unmap; - } - } else if (pte_special(pte)) + if (pte_special(pte)) goto pte_unmap; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + /* If it's not marked as special it must have a valid memmap. */ + VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); @@ -3001,12 +2898,9 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, * see Documentation/core-api/pin_user_pages.rst for * details. 
*/ - if (flags & FOLL_PIN) { - ret = arch_make_page_accessible(page); - if (ret) { - gup_put_folio(folio, 1, flags); - goto pte_unmap; - } + if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; } folio_set_referenced(folio); pages[*nr] = page; @@ -3016,8 +2910,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, ret = 1; pte_unmap: - if (pgmap) - put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } @@ -3040,96 +2932,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct folio *folio; - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - - if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - - folio = try_grab_folio_fast(page, 1, flags); - if (!folio) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - break; - } - folio_set_referenced(folio); - pages[*nr] = page; - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - - put_dev_pagemap(pgmap); - return addr == end; -} - -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} - -static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - unsigned long fault_pfn; - int nr_start = *nr; - - fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) - return 0; - - if (unlikely(pud_val(orig) != pud_val(*pudp))) { - gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); - return 0; - } - return 1; -} -#else -static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - return 0; -} - -static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - BUILD_BUG(); - return 0; -} -#endif - static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -3141,15 +2943,11 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pmd_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, - pages, nr); - } + if (pmd_special(orig)) + return 0; - page = pmd_page(orig); - refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); + 
refs = (end - addr) >> PAGE_SHIFT; + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -3169,7 +2967,10 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3185,15 +2986,11 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pud_devmap(orig)) { - if (unlikely(flags & FOLL_LONGTERM)) - return 0; - return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, - pages, nr); - } + if (pud_special(orig)) + return 0; - page = pud_page(orig); - refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -3214,47 +3011,10 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; - folio_set_referenced(folio); - return 1; -} - -static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, unsigned int flags, struct page **pages, - int *nr) -{ - int refs; - struct page *page; - struct folio *folio; - - if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) - return 0; - - BUILD_BUG_ON(pgd_devmap(orig)); - - page = pgd_page(orig); - refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); - - folio = try_grab_folio_fast(page, refs, flags); - if (!folio) - return 0; - - if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - if (!gup_fast_folio_allowed(folio, flags)) { - gup_put_folio(folio, refs, flags); - return 0; - } - - *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3283,15 +3043,6 @@ static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, pages, nr)) return 0; - } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { - /* - * architecture have different format for hugetlbfs - * pmd format and THP pmd format - */ - if (gup_hugepd(NULL, __hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, flags, pages, nr, - true) != 1) - return 0; } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -3309,7 +3060,7 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, pudp = pud_offset_lockless(p4dp, p4d, addr); do { - pud_t pud = READ_ONCE(*pudp); + pud_t pud = pudp_get(pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) @@ -3318,11 +3069,6 @@ static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags, pages, nr)) return 0; - } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { - if (gup_hugepd(NULL, __hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, flags, pages, nr, - true) != 1) - return 0; } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; @@ -3340,19 +3086,14 @@ static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { - p4d_t p4d = READ_ONCE(*p4dp); + p4d_t p4d = p4dp_get(p4dp); next = p4d_addr_end(addr, end); if (!p4d_present(p4d)) return 0; 
 BUILD_BUG_ON(p4d_leaf(p4d));
- if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
- if (gup_hugepd(NULL, __hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, flags, pages, nr,
- true) != 1)
- return 0;
- } else if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
- pages, nr))
+ if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
+ pages, nr))
 return 0;
 } while (p4dp++, addr = next, addr != end);
@@ -3367,22 +3108,14 @@ static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
 pgdp = pgd_offset(current->mm, addr);
 do {
- pgd_t pgd = READ_ONCE(*pgdp);
+ pgd_t pgd = pgdp_get(pgdp);
 next = pgd_addr_end(addr, end);
 if (pgd_none(pgd))
 return;
- if (unlikely(pgd_leaf(pgd))) {
- if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
- pages, nr))
- return;
- } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
- if (gup_hugepd(NULL, __hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, flags, pages, nr,
- true) != 1)
- return;
- } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
- pages, nr))
+ BUILD_BUG_ON(pgd_leaf(pgd));
+ if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
+ pages, nr))
 return;
 } while (pgdp++, addr = next, addr != end);
 }
@@ -3416,8 +3149,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
 return 0;
 if (gup_flags & FOLL_PIN) {
- seq = raw_read_seqcount(&current->mm->write_protect_seq);
- if (seq & 1)
+ if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
 return 0;
 }
@@ -3430,7 +3162,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
 * include/asm-generic/tlb.h for more details.
 *
 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
- * that come from THPs splitting.
+ * that come from callers of tlb_remove_table_sync_one().
 */
 local_irq_save(flags);
 gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
@@ -3466,7 +3198,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
 return -EINVAL;
 if (gup_flags & FOLL_PIN)
- mm_set_has_pinned_flag(&current->mm->flags);
+ mm_set_has_pinned_flag(current->mm);
 if (!(gup_flags & FOLL_FAST_ONLY))
 might_lock_read(&current->mm->mmap_lock);
@@ -3477,8 +3209,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
 return -EOVERFLOW;
 if (end > TASK_SIZE_MAX)
 return -EFAULT;
- if (unlikely(!access_ok((void __user *)start, len)))
- return -EFAULT;
 nr_pinned = gup_fast(start, end, gup_flags, pages);
 if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
@@ -3687,3 +3417,152 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
 &locked, gup_flags);
 }
 EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/**
+ * memfd_pin_folios() - pin folios associated with a memfd
+ * @memfd: the memfd whose folios are to be pinned
+ * @start: the first memfd offset
+ * @end: the last memfd offset (inclusive)
+ * @folios: array that receives pointers to the folios pinned
+ * @max_folios: maximum number of entries in @folios
+ * @offset: the offset into the first folio
+ *
+ * Attempt to pin folios associated with a memfd in the contiguous range
+ * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
+ * the folios can either be found in the page cache or need to be allocated
+ * if necessary. Once the folios are located, they are all pinned via
+ * FOLL_PIN and @offset is populated with the offset into the first folio.
+ * And, eventually, these pinned folios must be released either using
+ * unpin_folios() or unpin_folio().
+ *
+ * It must be noted that the folios may be pinned for an indefinite amount
+ * of time. And, in most cases, the duration of time they may stay pinned
+ * would be controlled by the userspace. This behavior is effectively the
+ * same as using FOLL_LONGTERM with other GUP APIs.
+ *
+ * Returns number of folios pinned, which could be less than @max_folios
+ * as it depends on the folio sizes that cover the range [start, end].
+ * If no folios were pinned, it returns -errno.
+ */
+long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
+ struct folio **folios, unsigned int max_folios,
+ pgoff_t *offset)
+{
+ unsigned int flags, nr_folios, nr_found;
+ unsigned int i, pgshift = PAGE_SHIFT;
+ pgoff_t start_idx, end_idx;
+ struct folio *folio = NULL;
+ struct folio_batch fbatch;
+ struct hstate *h;
+ long ret = -EINVAL;
+
+ if (start < 0 || start > end || !max_folios)
+ return -EINVAL;
+
+ if (!memfd)
+ return -EINVAL;
+
+ if (!shmem_file(memfd) && !is_file_hugepages(memfd))
+ return -EINVAL;
+
+ if (end >= i_size_read(file_inode(memfd)))
+ return -EINVAL;
+
+ if (is_file_hugepages(memfd)) {
+ h = hstate_file(memfd);
+ pgshift = huge_page_shift(h);
+ }
+
+ flags = memalloc_pin_save();
+ do {
+ nr_folios = 0;
+ start_idx = start >> pgshift;
+ end_idx = end >> pgshift;
+ if (is_file_hugepages(memfd)) {
+ start_idx <<= huge_page_order(h);
+ end_idx <<= huge_page_order(h);
+ }
+
+ folio_batch_init(&fbatch);
+ while (start_idx <= end_idx && nr_folios < max_folios) {
+ /*
+ * In most cases, we should be able to find the folios
+ * in the page cache. If we cannot find them for some
+ * reason, we try to allocate them and add them to the
+ * page cache.
+ */
+ nr_found = filemap_get_folios_contig(memfd->f_mapping,
+ &start_idx,
+ end_idx,
+ &fbatch);
+ if (folio) {
+ folio_put(folio);
+ folio = NULL;
+ }
+
+ for (i = 0; i < nr_found; i++) {
+ folio = fbatch.folios[i];
+
+ if (try_grab_folio(folio, 1, FOLL_PIN)) {
+ folio_batch_release(&fbatch);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (nr_folios == 0)
+ *offset = offset_in_folio(folio, start);
+
+ folios[nr_folios] = folio;
+ if (++nr_folios == max_folios)
+ break;
+ }
+
+ folio = NULL;
+ folio_batch_release(&fbatch);
+ if (!nr_found) {
+ folio = memfd_alloc_folio(memfd, start_idx);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ if (ret != -EEXIST)
+ goto err;
+ folio = NULL;
+ }
+ }
+ }
+
+ ret = check_and_migrate_movable_folios(nr_folios, folios);
+ } while (ret == -EAGAIN);
+
+ memalloc_pin_restore(flags);
+ return ret ? ret : nr_folios;
+err:
+ memalloc_pin_restore(flags);
+ unpin_folios(folios, nr_folios);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(memfd_pin_folios);
+
+/**
+ * folio_add_pins() - add pins to an already-pinned folio
+ * @folio: the folio to add more pins to
+ * @pins: number of pins to add
+ *
+ * Try to add more pins to an already-pinned folio. The semantics
+ * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
+ * be changed.
+ *
+ * This function is helpful when having obtained a pin on a large folio
+ * using memfd_pin_folios(), but wanting to logically unpin parts
+ * (e.g., individual pages) of the folio later, for example, using
+ * unpin_user_page_range_dirty_lock().
+ *
+ * This is not the right interface to initially pin a folio.
+ */
+int folio_add_pins(struct folio *folio, unsigned int pins)
+{
+ VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
+
+ return try_grab_folio(folio, pins, FOLL_PIN);
+}
+EXPORT_SYMBOL_GPL(folio_add_pins);
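
For reference, the reworked gup_fast_pmd_leaf()/gup_fast_pud_leaf() hunks earlier in this diff drop record_subpages() and derive both the pin count and the starting subpage directly from the faulting range, only filling the pages[] array after the entry has been revalidated. The short userspace sketch below is not kernel code: the constants assume 4 KiB base pages and 2 MiB PMD leaves, and all names are made up; it merely reproduces the same index arithmetic.

/*
 * Illustrative sketch only -- mirrors the arithmetic used by the new
 * gup_fast_pmd_leaf(): refs = pages covered by [addr, end), and the
 * first subpage is addr's page offset within the 2 MiB leaf mapping.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	/* walk covers 3 pages, starting 5 pages into a 2 MiB leaf */
	unsigned long addr = 0x40200000UL + 5 * PAGE_SIZE;
	unsigned long end  = addr + 3 * PAGE_SIZE;

	unsigned long refs    = (end - addr) >> PAGE_SHIFT;       /* pages to pin */
	unsigned long subpage = (addr & ~PMD_MASK) >> PAGE_SHIFT; /* first subpage index */

	printf("refs=%lu first_subpage=%lu\n", refs, subpage);    /* refs=3 first_subpage=5 */
	return 0;
}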
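And a minimal usage sketch for the memfd_pin_folios() interface added at the end of this diff. It is not taken from any in-tree caller: pin_memfd_range() and MAX_FOLIOS are hypothetical names, and the declarations are assumed to be visible via <linux/mm.h>. The only contract relied on is the one documented above: a positive return is the number of folios pinned, each of which must later be released with unpin_folio()/unpin_folios(); a negative return is -errno, with the error path above having already dropped any pins it took.

/* Hypothetical helper: pin the folios backing [start, end] of a memfd. */
#include <linux/fs.h>
#include <linux/mm.h>

#define MAX_FOLIOS 64	/* illustrative batch size, kept small for the on-stack array */

static long pin_memfd_range(struct file *memfd, loff_t start, loff_t end)
{
	struct folio *folios[MAX_FOLIOS];
	pgoff_t first_offset;
	long nr;

	nr = memfd_pin_folios(memfd, start, end, folios, MAX_FOLIOS,
			      &first_offset);
	if (nr < 0)
		return nr;	/* -errno, nothing left pinned */

	/*
	 * folios[0..nr-1] are now FOLL_PIN pinned; byte 'start' lives at
	 * offset 'first_offset' inside folios[0].  Use them, then drop the
	 * pins -- every folio returned must eventually be released.
	 */
	unpin_folios(folios, nr);

	return nr;
}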
