Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 635
1 file changed, 221 insertions, 414 deletions
diff --git a/mm/memory.c b/mm/memory.c index 2d8c265fc7d6..0ba4f6b71847 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,7 +57,6 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> -#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -125,6 +124,24 @@ int randomize_va_space __read_mostly = 2; #endif +static const struct ctl_table mmu_sysctl_table[] = { + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_mm_sysctl(void) +{ + register_sysctl_init("kernel", mmu_sysctl_table); + return 0; +} + +subsys_initcall(init_mm_sysctl); + #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -278,8 +295,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -349,6 +375,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, { struct unlink_vma_file_batch vb; + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -369,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, vma_start_write(vma); unlink_anon_vmas(vma); - if (is_vm_hugetlb_page(vma)) { - unlink_file_vma(vma); - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); - } else { - unlink_file_vma_batch_init(&vb); - unlink_file_vma_batch_add(&vb, vma); + unlink_file_vma_batch_init(&vb); + unlink_file_vma_batch_add(&vb, vma); - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) - vma_start_write(vma); - unlink_anon_vmas(vma); - unlink_file_vma_batch_add(&vb, vma); - } - unlink_file_vma_batch_final(&vb); - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { + vma = next; + next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; + if (mm_wr_locked) + vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma_batch_add(&vb, vma); } + unlink_file_vma_batch_final(&vb); + + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? 
next->vm_start : ceiling); vma = next; } while (vma); } @@ -518,10 +540,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -586,16 +609,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; - if (pte_devmap(pte)) - /* - * NOTE: New users of ZONE_DEVICE will not set pte_devmap() - * and will have refcounts incremented on their struct pages - * when they are inserted into PTEs, thus they are safe to - * return here. Legacy ZONE_DEVICE pages that set pte_devmap() - * do not have refcounts. Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. - */ - return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; @@ -673,9 +686,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, } } - if (pmd_devmap(pmd)) - return NULL; - if (is_huge_zero_pmd(pmd)) + if (is_huge_zero_pfn(pfn)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; @@ -785,7 +796,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; @@ -929,7 +940,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -973,10 +984,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -991,13 +1001,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. 
*/ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1011,8 +1020,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; @@ -1238,8 +1245,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) - || pmd_devmap(*src_pmd)) { + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1275,7 +1281,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); @@ -1361,7 +1367,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; - unsigned long next, pfn; + unsigned long next; bool is_cow; int ret; @@ -1371,12 +1377,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - ret = track_pfn_copy(dst_vma, src_vma, &pfn); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1418,8 +1418,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } - if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) - untrack_pfn_copy(dst_vma, pfn); return ret; } @@ -1545,7 +1543,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1575,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. 
*/ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); @@ -1797,9 +1792,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1839,7 +1834,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); @@ -1914,9 +1909,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1990,35 +1982,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; + + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. 
+ */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2418,7 +2439,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot, bool mkwrite) + unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; @@ -2440,7 +2461,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } @@ -2453,10 +2474,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } /* Ok, finally just insert the thing.. */ - if (pfn_t_devmap(pfn)) - entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); - else - entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); @@ -2525,10 +2543,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); + return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -2559,25 +2576,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn); -static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) +static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, + bool mkwrite) { - if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && + if (unlikely(is_zero_pfn(pfn)) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; - if (pfn_t_devmap(pfn)) - return true; - if (pfn_t_special(pfn)) - return true; - if (is_zero_pfn(pfn_t_to_pfn(pfn))) + if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn, bool mkwrite) + unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; @@ -2588,9 +2602,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* @@ -2600,8 +2614,7 @@ static vm_fault_t 
__vm_insert_mixed(struct vm_area_struct *vma, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && - !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* @@ -2609,7 +2622,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ - page = pfn_to_page(pfn_t_to_pfn(pfn)); + page = pfn_to_page(pfn); err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); @@ -2644,7 +2657,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) + unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } @@ -2656,7 +2669,7 @@ EXPORT_SYMBOL(vmf_insert_mixed); * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) + unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } @@ -2833,6 +2846,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return error; } +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} +#endif /* __HAVE_PFNMAP_TRACKING */ + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2845,20 +2888,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, * * Return: %0 on success, negative error code otherwise. */ +#ifdef __HAVE_PFNMAP_TRACKING int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { + struct pfnmap_track_ctx *ctx = NULL; int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); - if (err) + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. 
+ */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { return -EINVAL; + } err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } return err; } + +#else +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif EXPORT_SYMBOL(remap_pfn_range); /** @@ -2938,11 +3012,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -3523,7 +3597,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3730,12 +3804,10 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, * If all folio references are from mappings, and all mappings are in * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); - VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio)); - VM_WARN_ON_ONCE(folio_entire_mapcount(folio)); if (unlikely(folio_test_swapcache(folio))) { /* @@ -3753,13 +3825,15 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, /* Stabilize the mapcount vs. refcount and recheck. */ folio_lock_large_mapcount(folio); - VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio)); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); - if (folio_test_large_maybe_mapped_shared(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) goto unlock; if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); @@ -4224,26 +4298,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. 
- */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. @@ -4579,8 +4633,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { @@ -5013,7 +5067,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -5138,9 +5192,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -5188,7 +5241,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5213,7 +5266,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -5245,6 +5298,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ @@ -5305,6 +5360,7 @@ fallback: else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -5316,8 +5372,8 @@ fallback: } if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -5328,15 +5384,14 @@ fallback: return VM_FAULT_OOM; } - folio = page_folio(page); nr_pages = folio_nr_pages(folio); /* * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non-anonymous-shmem faults to avoid + * approach also applies to non shmem/tmpfs faults to avoid * inflating the RSS of the process. */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) || + if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { @@ -5892,7 +5947,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. 
*/ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -6056,7 +6111,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -6080,7 +6135,7 @@ retry_pud: pud_t orig_pud = *vmf.pud; barrier(); - if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and @@ -6121,7 +6176,7 @@ retry_pud: pmd_migration_entry_wait(mm, vmf.pmd); return 0; } - if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { + if (pmd_trans_huge(vmf.orig_pmd)) { if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf); @@ -6338,258 +6393,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. - * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalent to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). - * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. 
- */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) -{ - unsigned int tgt_refcnt = VMA_LOCK_OFFSET; - - /* Additional refcnt if the vma is attached. */ - if (!detaching) - tgt_refcnt++; - - /* - * If vma is detached then only vma_mark_attached() can raise the - * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). - */ - if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) - return false; - - rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); - rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, - refcount_read(&vma->vm_refcnt) == tgt_refcnt, - TASK_UNINTERRUPTIBLE); - lock_acquired(&vma->vmlock_dep_map, _RET_IP_); - - return true; -} - -static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) -{ - *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); - rwsem_release(&vma->vmlock_dep_map, _RET_IP_); -} - -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) -{ - bool locked; - - /* - * __vma_enter_locked() returns false immediately if the vma is not - * attached, otherwise it waits until refcnt is indicating that vma - * is attached with no readers. - */ - locked = __vma_enter_locked(vma, false); - - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. - */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - - if (locked) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(detached); /* vma should remain attached */ - } -} -EXPORT_SYMBOL_GPL(__vma_start_write); - -void vma_mark_detached(struct vm_area_struct *vma) -{ - vma_assert_write_locked(vma); - vma_assert_attached(vma); - - /* - * We are the only writer, so no need to use vma_refcount_put(). - * The condition below is unlikely because the vma has been already - * write-locked and readers can increment vm_refcnt only temporarily - * before they check vm_lock_seq, realize the vma is locked and drop - * back the vm_refcnt. That is a narrow window for observing a raised - * vm_refcnt. - */ - if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { - /* Wait until vma is detached with no readers. */ - if (__vma_enter_locked(vma, true)) { - bool detached; - - __vma_exit_locked(vma, &detached); - WARN_ON_ONCE(!detached); - } - } -} - -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. 
- */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - vma = vma_start_read(mm, vma); - if (IS_ERR_OR_NULL(vma)) { - /* Check if the VMA got isolated after we found it */ - if (PTR_ERR(vma) == -EAGAIN) { - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - - /* Failed to lock the VMA */ - goto inval; - } - /* - * At this point, we have a stable reference to a VMA: The VMA is - * locked and we know it hasn't already been isolated. - * From here on, we can access the VMA without worrying about which - * fields are accessible for RCU readers. - */ - - /* Check if the vma we locked is the right one. */ - if (unlikely(vma->vm_mm != mm || - address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. @@ -6900,6 +6703,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset; void *maddr; + struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); @@ -6931,21 +6735,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes <= 0) break; } else { + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; @@ -7024,6 +6829,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset, retval; void *maddr; + struct folio *folio; struct page *page; struct vm_area_struct *vma = NULL; @@ -7039,17 +6845,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, goto out; } + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE - 1); if (bytes > PAGE_SIZE - offset) bytes = PAGE_SIZE - offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); retval = strscpy(buf, maddr + offset, bytes); if (retval >= 0) { /* Found the end of the string */ buf += retval; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); break; } @@ -7067,7 +6874,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, } len -= bytes; - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } out: |