diff options
Diffstat (limited to 'mm/memory.c')
| -rw-r--r-- | mm/memory.c | 1969 |
1 files changed, 1197 insertions, 772 deletions
diff --git a/mm/memory.c b/mm/memory.c index 75c2dfd04f72..2a55edc48a65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,15 +57,15 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> -#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/shmem_fs.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> @@ -76,13 +76,13 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/pgalloc.h> +#include <linux/uaccess.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> @@ -94,14 +94,6 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NUMA -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); - -struct page *mem_map; -EXPORT_SYMBOL(mem_map); -#endif - static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); @@ -117,18 +109,10 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; - return pte_marker_uffd_wp(vmf->orig_pte); + return pte_is_uffd_wp_marker(vmf->orig_pte); } /* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. - */ -void *high_memory; -EXPORT_SYMBOL(high_memory); - -/* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, @@ -141,6 +125,24 @@ int randomize_va_space __read_mostly = 2; #endif +static const struct ctl_table mmu_sysctl_table[] = { + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_mm_sysctl(void) +{ + register_sysctl_init("kernel", mmu_sysctl_table); + return 0; +} + +subsys_initcall(init_mm_sysctl); + #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -294,8 +296,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -365,6 +376,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, { struct unlink_vma_file_batch vb; + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -385,32 +398,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, vma_start_write(vma); unlink_anon_vmas(vma); - if (is_vm_hugetlb_page(vma)) { - unlink_file_vma(vma); - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); - } else { - unlink_file_vma_batch_init(&vb); - unlink_file_vma_batch_add(&vb, vma); + unlink_file_vma_batch_init(&vb); + unlink_file_vma_batch_add(&vb, vma); - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) - vma_start_write(vma); - unlink_anon_vmas(vma); - unlink_file_vma_batch_add(&vb, vma); - } - unlink_file_vma_batch_final(&vb); - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { + vma = next; + next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; + if (mm_wr_locked) + vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma_batch_add(&vb, vma); } + unlink_file_vma_batch_final(&vb); + + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); vma = next; } while (vma); } @@ -485,22 +492,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. - */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -512,7 +505,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -523,37 +516,135 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. +/** + * __vm_normal_page() - Get the "struct page" associated with a page table entry. + * @vma: The VMA mapping the page table entry. + * @addr: The address where the page table entry is mapped. + * @pfn: The PFN stored in the page table entry. + * @special: Whether the page table entry is marked "special". + * @level: The page table level for error reporting purposes only. + * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. + * case, NULL is returned here. "Normal" mappings do have a struct page and + * are ordinarily refcounted. + * + * Page mappings of the shared zero folios are always considered "special", as + * they are not ordinarily refcounted: neither the refcount nor the mapcount + * of these folios is adjusted when mapping them into user page tables. + * Selected page table walkers (such as GUP) can still identify mappings of the + * shared zero folios and work with the underlying "struct page". + * + * There are 2 broad cases. Firstly, an architecture may define a "special" + * page table entry bit, such as pte_special(), in which case this function is + * trivial. Secondly, an architecture may not have a spare page table + * entry bit, which requires a more complicated scheme, described below. * - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). @@ -578,82 +669,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The only exception are zeropages, which are - * *never* refcounted. + * page (that is, those where pfn_valid is true, except the shared zero + * folios) are refcounted and considered normal pages by the VM. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static inline struct page *__vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, bool special, + unsigned long long entry, enum pgtable_level level) { - unsigned long pfn = pte_pfn(pte); - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return NULL; - if (is_zero_pfn(pfn)) + if (unlikely(special)) { +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; - if (pte_devmap(pte)) + } /* - * NOTE: New users of ZONE_DEVICE will not set pte_devmap() - * and will have refcounts incremented on their struct pages - * when they are inserted into PTEs, thus they are safe to - * return here. Legacy ZONE_DEVICE pages that set pte_devmap() - * do not have refcounts. Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table + * mappings (incl. shared zero folios) are marked accordingly. */ - return NULL; - - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ + } else { + if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + /* If it has a "struct page", it's "normal". */ + if (!pfn_valid(pfn)) + return NULL; + } else { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + /* Only CoW'ed anon folios are "normal". */ + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } - } - if (is_zero_pfn(pfn)) - return NULL; + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + } -check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); + /* Corrupted page table entry. */ + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } - /* * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. + * For example, VDSO mappings can cause them to exist. */ -out: - VM_WARN_ON_ONCE(is_zero_pfn(pfn)); + VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); return pfn_to_page(pfn); } +/** + * vm_normal_page() - Get the "struct page" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), + pte_val(pte), PGTABLE_LEVEL_PTE); +} + +/** + * vm_normal_folio() - Get the "struct folio" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -665,45 +778,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/** + * vm_normal_page_pmd() - Get the "struct page" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { - unsigned long pfn = pmd_pfn(pmd); - - /* Currently it's only used for huge pfnmaps */ - if (unlikely(pmd_special(pmd))) - return NULL; - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (pmd_devmap(pmd)) - return NULL; - if (is_huge_zero_pmd(pmd)) - return NULL; - if (unlikely(pfn > highest_memmap_pfn)) - return NULL; - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), + pmd_val(pmd), PGTABLE_LEVEL_PMD); } +/** + * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { @@ -713,44 +818,74 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif +/** + * restore_exclusive_pte - Restore a device-exclusive entry + * @vma: VMA covering @address + * @folio: the mapped folio + * @page: the mapped folio page + * @address: the virtual address + * @ptep: pte pointer into the locked page table mapping the folio page + * @orig_pte: pte value at @ptep + * + * Restore a device-exclusive non-swap entry to an ordinary present pte. + * + * The folio and the page table must be locked, and MMU notifiers must have + * been called to invalidate any (exclusive) device mappings. + * + * Locking the folio makes sure that anybody who just converted the pte to + * a device-exclusive entry can map it into the device to make forward + * progress without others converting it back until the folio was unlocked. + * + * If the folio lock ever becomes an issue, we can stop relying on the folio + * lock; it might make some scenarios with heavy thrashing less likely to + * make forward progress, but these scenarios might not be valid use cases. + * + * Note that the folio lock does not protect against all cases of concurrent + * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers + * must use MMU notifiers to sync against any concurrent changes. + */ static void restore_exclusive_pte(struct vm_area_struct *vma, - struct page *page, unsigned long address, - pte_t *ptep) + struct folio *folio, struct page *page, unsigned long address, + pte_t *ptep, pte_t orig_pte) { - struct folio *folio = page_folio(page); - pte_t orig_pte; pte_t pte; - swp_entry_t entry; - orig_pte = ptep_get(ptep); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(orig_pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); - else if (is_writable_device_exclusive_entry(entry)) - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - - VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) && - PageAnonExclusive(page)), folio); - - /* - * No need to take a page reference as one was already - * created when the swap entry was made. - */ - if (folio_test_anon(folio)) - folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE); - else - /* - * Currently device exclusive access only supports anonymous - * memory so the entry shouldn't point to a filebacked page. - */ - WARN_ON_ONCE(1); + if ((vma->vm_flags & VM_WRITE) && + can_change_pte_writable(vma, address, pte)) { + if (folio_test_dirty(folio)) + pte = pte_mkdirty(pte); + pte = pte_mkwrite(pte, vma); + } set_pte_at(vma->vm_mm, address, ptep, pte); /* @@ -764,16 +899,16 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. */ -static int -try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr) +static int try_restore_exclusive_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t orig_pte) { - swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); - struct page *page = pfn_swap_entry_to_page(entry); + const softleaf_t entry = softleaf_from_pte(orig_pte); + struct page *page = softleaf_to_page(entry); + struct folio *folio = page_folio(page); - if (trylock_page(page)) { - restore_exclusive_pte(vma, page, addr, src_pte); - unlock_page(page); + if (folio_trylock(folio)) { + restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); + folio_unlock(folio); return 0; } @@ -791,14 +926,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); + softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); - if (likely(!non_swap_entry(entry))) { + if (likely(softleaf_is_swap(entry))) { if (swap_duplicate(entry) < 0) return -EIO; @@ -816,12 +951,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; - if (!is_readable_migration_entry(entry) && + if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child @@ -830,15 +965,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ entry = make_readable_migration_entry( swp_offset(entry)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_private_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_private(entry)) { + page = softleaf_to_page(entry); folio = page_folio(page); /* @@ -853,7 +988,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ - folio_try_dup_anon_rmap_pte(folio, page, src_vma); + folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so @@ -862,7 +997,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_writable_device_private_entry(entry) && + if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); @@ -871,7 +1006,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_exclusive_entry(entry)) { + } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device @@ -879,10 +1014,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * (ie. COW) mappings. */ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); - if (try_restore_exclusive_pte(src_pte, src_vma, addr)) + if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) @@ -935,7 +1070,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -979,10 +1114,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -997,28 +1131,25 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, - nr, src_vma))) { + nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_ptes(folio, page, nr); + folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; @@ -1032,7 +1163,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * guarantee the pinned page won't be randomly replaced in the * future. */ - if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { + if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, @@ -1042,7 +1173,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_pte(folio, page); + folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } @@ -1087,7 +1218,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; - swp_entry_t entry = (swp_entry_t){0}; + softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; @@ -1151,7 +1282,7 @@ again: dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(ptep_get(src_pte)); + entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1244,9 +1375,9 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) - || pmd_devmap(*src_pmd)) { + if (pmd_is_huge(*src_pmd)) { int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); @@ -1281,7 +1412,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); @@ -1334,18 +1465,12 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { + if (src_vma->vm_flags & VM_COPY_ON_FORK) + return true; /* - * Always copy pgtables when dst_vma has uffd-wp enabled even if it's - * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable - * contains uffd-wp protection information, that's something we can't - * retrieve from page cache, and skip copying will lose those info. + * The presence of an anon_vma indicates an anonymous VMA has page + * tables which naturally cannot be reconstituted on page fault. */ - if (userfaultfd_wp(dst_vma)) - return true; - - if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return true; - if (src_vma->anon_vma) return true; @@ -1362,12 +1487,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next; bool is_cow; int ret; @@ -1377,16 +1502,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1419,7 +1534,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1436,7 +1550,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1466,34 +1580,44 @@ static inline bool zap_drop_markers(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. + * + * Returns true if uffd-wp ptes was installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + + if (!uffd_supports_wp_marker()) + return false; + /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; if (zap_drop_markers(details)) - return; + return false; for (;;) { /* the PFN in the PTE is irrelevant. */ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } + + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1519,8 +1643,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1544,9 +1668,8 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1559,34 +1682,141 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if (unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + softleaf_t entry; + int nr = 1; + + *any_skipped = true; + entry = softleaf_from_pte(ptent); + if (softleaf_is_device_private(entry) || + softleaf_is_device_exclusive(entry)) { + struct page *page = softleaf_to_page(entry); + struct folio *folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (softleaf_is_swap(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (softleaf_is_uffd_wp_marker(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (softleaf_is_guard_marker(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if (!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1598,9 +1828,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = true; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1610,90 +1844,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; + bool any_skipped = false; - nr = 1; - if (pte_none(ptent)) - continue; - - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; - - if (pte_present(ptent)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; } - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_markers(details)) - continue; - } else if (is_guard_swp_entry(entry)) { - /* - * Ordinary zapping should not remove guard PTE - * markers. Only do so if we should remove PTE markers - * in general. - */ - if (!zap_drop_markers(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + direct_reclaim = false; + break; } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. + */ + if (can_reclaim_pt && direct_reclaim && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1713,6 +1892,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1727,9 +1920,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1769,7 +1962,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); @@ -1829,8 +2022,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, - struct zap_details *details, bool mm_wr_locked) + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1844,9 +2036,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1879,7 +2068,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check - * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -1894,8 +2082,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { struct mmu_notifier_range range; struct zap_details details = { @@ -1911,8 +2098,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details, - mm_wr_locked); + unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); @@ -1920,36 +2106,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. */ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; - lru_add_drain(); + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. + */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -2048,27 +2262,46 @@ static int validate_page_before_insert(struct vm_area_struct *vma, return -EINVAL; return 0; } - if (folio_test_anon(folio) || folio_test_slab(folio) || - page_has_type(page)) + if (folio_test_anon(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, + pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); - pte_t pteval; + pte_t pteval = ptep_get(pte); + + if (!pte_none(pteval)) { + if (!mkwrite) + return -EBUSY; + + /* see insert_pfn(). */ + if (pte_pfn(pteval) != page_to_pfn(page)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); + return -EFAULT; + } + pteval = maybe_mkwrite(pteval, vma); + pteval = pte_mkyoung(pteval); + if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) + update_mmu_cache(vma, addr, pte); + return 0; + } - if (!pte_none(ptep_get(pte))) - return -EBUSY; /* Ok, finally just insert the thing.. */ pteval = mk_pte(page, prot); if (unlikely(is_zero_folio(folio))) { pteval = pte_mkspecial(pteval); } else { folio_get(folio); + pteval = mk_pte(page, prot); + if (mkwrite) { + pteval = pte_mkyoung(pteval); + pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); + } inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); folio_add_file_rmap_pte(folio, page, vma); } @@ -2077,7 +2310,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, } static int insert_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, pgprot_t prot) + struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; @@ -2090,7 +2323,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, + mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; @@ -2104,7 +2338,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, err = validate_page_before_insert(vma, page); if (err) return err; - return insert_page_into_pte_locked(vma, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations @@ -2240,7 +2474,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); @@ -2328,7 +2562,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot, bool mkwrite) + unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; @@ -2350,7 +2584,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } @@ -2363,10 +2597,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } /* Ok, finally just insert the thing.. */ - if (pfn_t_devmap(pfn)) - entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); - else - entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); @@ -2435,10 +2666,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); + return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -2469,25 +2699,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn); -static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite) +static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, + bool mkwrite) { - if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) && + if (unlikely(is_zero_pfn(pfn)) && (mkwrite || !vm_mixed_zeropage_allowed(vma))) return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; - if (pfn_t_devmap(pfn)) - return true; - if (pfn_t_special(pfn)) - return true; - if (is_zero_pfn(pfn_t_to_pfn(pfn))) + if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn, bool mkwrite) + unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; @@ -2498,9 +2725,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* @@ -2510,8 +2737,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && - !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* @@ -2519,8 +2745,8 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ - page = pfn_to_page(pfn_t_to_pfn(pfn)); - err = insert_page(vma, addr, page, pgprot); + page = pfn_to_page(pfn); + err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } @@ -2533,8 +2759,28 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write) +{ + pgprot_t pgprot = vmf->vma->vm_page_prot; + unsigned long addr = vmf->address; + int err; + + if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + err = insert_page(vmf->vma, addr, page, pgprot, write); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); + vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) + unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } @@ -2546,7 +2792,7 @@ EXPORT_SYMBOL(vmf_insert_mixed); * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) + unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } @@ -2649,6 +2895,25 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -2661,31 +2926,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. - */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2706,7 +2947,7 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad * Variant of remap_pfn_range that does not call track_pfn_remap. The caller * must have pre-validated the caching bits of the pgprot_t. */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); @@ -2723,6 +2964,113 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return error; } +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} + +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct pfnmap_track_ctx *ctx = NULL; + int err; + + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. + */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { + return -EINVAL; + } + + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } + return err; +} + +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} +#else +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. + */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2740,17 +3088,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, { int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); if (err) - return -EINVAL; + return err; - err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); - return err; + return do_remap_pfn_range(vma, addr, pfn, size, prot); } EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -2828,11 +3179,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -2971,8 +3322,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err = -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -3013,7 +3366,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -3412,7 +3764,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3596,19 +3948,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } -static bool wp_can_reuse_anon_folio(struct folio *folio, - struct vm_area_struct *vma) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) { + bool exclusive = false; + + /* Let's just free up a large folio if only a single page is mapped. */ + if (folio_large_mapcount(folio) <= 1) + return false; + /* - * We could currently only reuse a subpage of a large folio if no - * other subpages of the large folios are still mapped. However, - * let's just consistently not reuse subpages even if we could - * reuse in that scenario, and give back a large folio a bit - * sooner. + * The assumption for anonymous folios is that each page can only get + * mapped once into each MM. The only exception are KSM folios, which + * are always small. + * + * Each taken mapcount must be paired with exactly one taken reference, + * whereby the refcount must be incremented before the mapcount when + * mapping a page, and the refcount must be decremented after the + * mapcount when unmapping a page. + * + * If all folio references are from mappings, and all mappings are in + * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) + return false; + + VM_WARN_ON_ONCE(folio_test_ksm(folio)); + + if (unlikely(folio_test_swapcache(folio))) { + /* + * Note: freeing up the swapcache will fail if some PTEs are + * still swap entries. + */ + if (!folio_trylock(folio)) + return false; + folio_free_swap(folio); + folio_unlock(folio); + } + + if (folio_large_mapcount(folio) != folio_ref_count(folio)) return false; + /* Stabilize the mapcount vs. refcount and recheck. */ + folio_lock_large_mapcount(folio); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); + + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) + goto unlock; + if (folio_large_mapcount(folio) != folio_ref_count(folio)) + goto unlock; + + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && + folio_mm_id(folio, 1) != vma->vm_mm->mm_id); + + /* + * Do we need the folio lock? Likely not. If there would have been + * references from page migration/swapout, we would have detected + * an additional folio reference and never ended up here. + */ + exclusive = true; +unlock: + folio_unlock_large_mapcount(folio); + return exclusive; +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + BUILD_BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) + return __wp_can_reuse_large_anon_folio(folio, vma); + /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing @@ -3717,13 +4136,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a - * VM_PFNMAP VMA. + * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ - if (!vmf->page) + if (!vmf->page || is_fsdax_page(vmf->page)) { + vmf->page = NULL; return wp_pfn_shared(vmf); + } return wp_page_shared(vmf, folio); } @@ -3913,7 +4334,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) folio_put(folio); return ret; } - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); @@ -3921,7 +4342,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) - restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, folio, vmf->page, vmf->address, + vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3945,7 +4367,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache - * reference only in case it's likely that we'll be the exlusive user. + * reference only in case it's likely that we'll be the exclusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && folio_ref_count(folio) == (1 + folio_nr_pages(folio)); @@ -3963,7 +4385,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. - * So is_pte_marker() check is not enough to safely drop the pte. + * So pte_is_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); @@ -3997,8 +4419,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { - swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); - unsigned long marker = pte_marker_get(entry); + const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); + const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. If anything weird happened, @@ -4015,7 +4437,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_GUARD) return VM_FAULT_SIGSEGV; - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ @@ -4026,13 +4448,13 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio; - swp_entry_t entry; + softleaf_t entry; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { folio_put(folio); @@ -4043,26 +4465,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. @@ -4070,7 +4472,7 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) { unsigned long addr; - swp_entry_t entry; + softleaf_t entry; int idx; pte_t pte; @@ -4080,7 +4482,7 @@ static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) return false; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) return false; @@ -4126,7 +4528,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) unsigned long orders; struct folio *folio; unsigned long addr; - swp_entry_t entry; + softleaf_t entry; spinlock_t *ptl; pte_t *pte; gfp_t gfp; @@ -4147,13 +4549,13 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!zswap_never_enabled()) goto fallback; - entry = pte_to_swp_entry(vmf->orig_pte); + entry = softleaf_from_pte(vmf->orig_pte); /* * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -4189,8 +4591,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } @@ -4224,7 +4628,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; - swp_entry_t entry; + softleaf_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; @@ -4236,15 +4640,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - entry = pte_to_swp_entry(vmf->orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { + entry = softleaf_from_pte(vmf->orig_pte); + if (unlikely(!softleaf_is_swap(entry))) { + if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); - } else if (is_device_exclusive_entry(entry)) { - vmf->page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_exclusive(entry)) { + vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -4255,7 +4659,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } - vmf->page = pfn_swap_entry_to_page(entry); + vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || @@ -4267,13 +4671,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Get a page reference while we know the page can't be * freed. */ - get_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); - put_page(vmf->page); - } else if (is_hwpoison_entry(entry)) { + if (trylock_page(vmf->page)) { + struct dev_pagemap *pgmap; + + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + pgmap = page_pgmap(vmf->page); + ret = pgmap->ops->migrate_to_ram(vmf); + unlock_page(vmf->page); + put_page(vmf->page); + } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); @@ -4287,9 +4699,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - folio = swap_cache_get_folio(entry, vma, vmf->address); + folio = swap_cache_get_folio(entry); if (folio) - page = folio_file_page(folio, swp_offset(entry)); + swap_update_readahead(folio, vma, vmf->address); swapcache = folio; if (!folio) { @@ -4324,9 +4736,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } need_clear_cache = true; - mem_cgroup_swapin_uncharge_swap(entry, nr_pages); + memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(folio, shadow); @@ -4360,20 +4772,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - page = folio_file_page(folio, swp_offset(entry)); - } else if (PageHWPoison(page)) { - /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; + page = folio_file_page(folio, swp_offset(entry)); if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the @@ -4382,14 +4787,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!folio_test_swapcache(folio) || - page_swap_entry(page).val != entry.val)) + if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; + if (unlikely(PageHWPoison(page))) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + goto out_page; + } + /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. */ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { @@ -4690,8 +5103,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -4733,12 +5146,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } @@ -4822,7 +5235,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -4947,9 +5360,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; @@ -4960,9 +5372,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) @@ -4997,7 +5411,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -5022,7 +5436,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -5030,7 +5444,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) /** * set_pte_range - Set a range of PTEs to point to pages in a folio. - * @vmf: Fault decription. + * @vmf: Fault description. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. @@ -5054,6 +5468,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ @@ -5102,7 +5518,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf) bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); int type, nr_pages; - unsigned long addr = vmf->address; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -5110,6 +5530,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -5120,9 +5541,26 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } + if (!needs_fallback && vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t file_end; + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. + * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + needs_fallback = !shmem_mapping(mapping) && + file_end < folio_next_index(folio); + } + if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (!needs_fallback && folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -5133,15 +5571,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } - folio = page_folio(page); nr_pages = folio_nr_pages(folio); - /* - * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non-anonymous-shmem faults to avoid - * inflating the RSS of the process. - */ - if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) { + /* Using per-page fault to maintain the uffd semantics */ + if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); @@ -5177,9 +5610,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; goto unlock; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); - ret = VM_FAULT_NOPAGE; - goto unlock; + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } folio_ref_add(folio, nr_pages - 1); @@ -5488,7 +5921,7 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) *flags |= TNF_SHARED; /* * For memory tiering mode, cpupid of slow memory page is used @@ -5625,7 +6058,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); @@ -5696,7 +6129,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. */ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -5740,6 +6173,45 @@ split: } /* + * The page faults may be spurious because of the racy access to the + * page table. For example, a non-populated virtual page is accessed + * on 2 CPUs simultaneously, thus the page faults are triggered on + * both CPUs. However, it's possible that one CPU (say CPU A) cannot + * find the reason for the page fault if the other CPU (say CPU B) has + * changed the page table before the PTE is checked on CPU A. Most of + * the time, the spurious page faults can be ignored safely. However, + * if the page fault is for the write access, it's possible that a + * stale read-only TLB entry exists in the local CPU and needs to be + * flushed on some architectures. This is called the spurious page + * fault fixing. + * + * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() + * by default and used as such on most architectures, while + * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and + * used as such on most architectures. + */ +static void fix_spurious_fault(struct vm_fault *vmf, + enum pgtable_level ptlevel) +{ + /* Skip spurious TLB flush for retried page fault */ + if (vmf->flags & FAULT_FLAG_TRIED) + return; + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} +/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. @@ -5820,23 +6292,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. - */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -5860,7 +6320,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -5875,8 +6335,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5884,7 +6343,7 @@ retry_pud: pud_t orig_pud = *vmf.pud; barrier(); - if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and @@ -5910,37 +6369,45 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) + if (ret & VM_FAULT_FALLBACK) + goto fallback; + else return ret; - } else { - vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + } - if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(vmf.orig_pmd)); - if (is_pmd_migration_entry(vmf.orig_pmd)) - pmd_migration_entry_wait(mm, vmf.pmd); - return 0; - } - if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf); + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + if (pmd_none(vmf.orig_pmd)) + goto fallback; - if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !pmd_write(vmf.orig_pmd)) { - ret = wp_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; - } else { - huge_pmd_set_accessed(&vmf); - return 0; - } + if (unlikely(!pmd_present(vmf.orig_pmd))) { + if (pmd_is_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + + if (pmd_is_migration_entry(vmf.orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } + if (pmd_trans_huge(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); + + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); + return 0; } } +fallback: return handle_pte_fault(&vmf); } @@ -6069,7 +6536,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -6141,173 +6609,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. - * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalend to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). - * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. - */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. - */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - if (!vma_start_read(vma)) - goto inval; - - /* Check if the VMA got isolated after we found it */ - if (vma->detached) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - /* - * At this point, we have a stable reference to a VMA: The VMA is - * locked and we know it hasn't already been isolated. - * From here on, we can access the VMA without worrying about which - * fields are accessible for RCU readers. - */ - - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. @@ -6388,6 +6689,7 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, args->lock = lock; args->ptep = ptep; args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->addr_mask = addr_mask; args->pgprot = pgprot; args->writable = writable; args->special = special; @@ -6463,12 +6765,12 @@ retry: goto out; p4dp = p4d_offset(pgdp, address); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; pudp = pud_offset(p4dp, address); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (pud_none(pud)) goto out; if (pud_leaf(pud)) { @@ -6547,7 +6849,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; - unsigned long prot = 0; + pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; @@ -6557,7 +6859,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, retry: if (follow_pfnmap_start(&args)) return -EINVAL; - prot = pgprot_val(args.pgprot); + prot = args.pgprot; phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; writable = args.writable; follow_pfnmap_end(&args); @@ -6572,7 +6874,7 @@ retry: if (follow_pfnmap_start(&args)) goto out_unmap; - if ((prot != pgprot_val(args.pgprot)) || + if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || (phys_addr != (args.pfn << PAGE_SHIFT)) || (writable != args.writable)) { follow_pfnmap_end(&args); @@ -6617,6 +6919,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset; void *maddr; + struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); @@ -6648,21 +6951,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes <= 0) break; } else { + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; @@ -6714,6 +7018,126 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + void *old_buf = buf; + int err = 0; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return -EFAULT; + + addr = untagged_addr_remote(mm, addr); + + /* Avoid triggering the temporary warning in __get_user_pages */ + if (!vma_lookup(mm, addr)) { + err = -EFAULT; + goto out; + } + + while (len) { + int bytes, offset, retval; + void *maddr; + struct folio *folio; + struct page *page; + struct vm_area_struct *vma = NULL; + + page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + if (IS_ERR(page)) { + /* + * Treat as a total failure for now until we decide how + * to handle the CONFIG_HAVE_IOREMAP_PROT case and + * stack expansion. + */ + *(char *)buf = '\0'; + err = -EFAULT; + goto out; + } + + folio = page_folio(page); + bytes = len; + offset = addr & (PAGE_SIZE - 1); + if (bytes > PAGE_SIZE - offset) + bytes = PAGE_SIZE - offset; + + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); + retval = strscpy(buf, maddr + offset, bytes); + if (retval >= 0) { + /* Found the end of the string */ + buf += retval; + folio_release_kmap(folio, maddr); + break; + } + + buf += bytes - 1; + /* + * Because strscpy always NUL terminates we need to + * copy the last byte in the page if we are going to + * load more pages + */ + if (bytes != len) { + addr += bytes - 1; + copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); + buf += 1; + addr += 1; + } + len -= bytes; + + folio_release_kmap(folio, maddr); + } + +out: + mmap_read_unlock(mm); + if (err) + return err; + return buf - old_buf; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /* * Print the name of a VMA. */ @@ -6746,10 +7170,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(¤t->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif @@ -6815,9 +7237,10 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct folio *folio, unsigned long addr, +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; might_sleep(); @@ -6851,13 +7274,14 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint) } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, + unsigned long addr_hint, struct vm_area_struct *vma, unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); @@ -6959,7 +7383,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif |
