diff options
Diffstat (limited to 'mm/memory.c')
| -rw-r--r-- | mm/memory.c | 2871 |
1 files changed, 1902 insertions, 969 deletions
diff --git a/mm/memory.c b/mm/memory.c index d10e616d7389..2a55edc48a65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,4 +1,3 @@ - // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c @@ -44,7 +43,6 @@ #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> -#include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> @@ -59,15 +57,15 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> -#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> +#include <linux/shmem_fs.h> #include <linux/memory-tiers.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> @@ -78,13 +76,13 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/pgalloc.h> +#include <linux/uaccess.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> -#include <asm/pgalloc.h> -#include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> @@ -96,14 +94,6 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 
#endif -#ifndef CONFIG_NUMA -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); - -struct page *mem_map; -EXPORT_SYMBOL(mem_map); -#endif - static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); @@ -119,18 +109,10 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return false; - return pte_marker_uffd_wp(vmf->orig_pte); + return pte_is_uffd_wp_marker(vmf->orig_pte); } /* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. - */ -void *high_memory; -EXPORT_SYMBOL(high_memory); - -/* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, @@ -143,6 +125,24 @@ int randomize_va_space __read_mostly = 2; #endif +static const struct ctl_table mmu_sysctl_table[] = { + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_mm_sysctl(void) +{ + register_sysctl_init("kernel", mmu_sysctl_table); + return 0; +} + +subsys_initcall(init_mm_sysctl); + #ifndef arch_wants_old_prefaulted_pte static inline bool arch_wants_old_prefaulted_pte(void) { @@ -296,8 +296,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. 
+/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -365,6 +374,10 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { + struct unlink_vma_file_batch vb; + + tlb_free_vmas(tlb); + do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -384,29 +397,27 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); - unlink_file_vma(vma); - if (is_vm_hugetlb_page(vma)) { - hugetlb_free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? next->vm_start : ceiling); - } else { - /* - * Optimization: gather nearby vmas into one call down - */ - while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { - vma = next; - next = mas_find(mas, ceiling - 1); - if (unlikely(xa_is_zero(next))) - next = NULL; - if (mm_wr_locked) - vma_start_write(vma); - unlink_anon_vmas(vma); - unlink_file_vma(vma); - } - free_pgd_range(tlb, addr, vma->vm_end, - floor, next ? 
next->vm_start : ceiling); + unlink_file_vma_batch_init(&vb); + unlink_file_vma_batch_add(&vb, vma); + + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { + vma = next; + next = mas_find(mas, ceiling - 1); + if (unlikely(xa_is_zero(next))) + next = NULL; + if (mm_wr_locked) + vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma_batch_add(&vb, vma); } + unlink_file_vma_batch_final(&vb); + + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); vma = next; } while (vma); } @@ -481,22 +492,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. - */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -508,7 +505,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -519,37 +516,135 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, 
*pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL, mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. +/** + * __vm_normal_page() - Get the "struct page" associated with a page table entry. + * @vma: The VMA mapping the page table entry. + * @addr: The address where the page table entry is mapped. + * @pfn: The PFN stored in the page table entry. + * @special: Whether the page table entry is marked "special". + * @level: The page table level for error reporting purposes only. + * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. + * case, NULL is returned here. "Normal" mappings do have a struct page and + * are ordinarily refcounted. 
* - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. + * Page mappings of the shared zero folios are always considered "special", as + * they are not ordinarily refcounted: neither the refcount nor the mapcount + * of these folios is adjusted when mapping them into user page tables. + * Selected page table walkers (such as GUP) can still identify mappings of the + * shared zero folios and work with the underlying "struct page". + * + * There are 2 broad cases. Firstly, an architecture may define a "special" + * page table entry bit, such as pte_special(), in which case this function is + * trivial. Secondly, an architecture may not have a spare page table + * entry bit, which requires a more complicated scheme, described below. + * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). @@ -574,76 +669,104 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The disadvantage is that pages are refcounted - * (which can be slower and simply not an option for some PFNMAP users). 
The - * advantage is that we don't have to follow the strict linearity rule of - * PFNMAP mappings in order to support COWable mappings. + * page (that is, those where pfn_valid is true, except the shared zero + * folios) are refcounted and considered normal pages by the VM. * + * The disadvantage is that pages are refcounted (which can be slower and + * simply not an option for some PFNMAP users). The advantage is that we + * don't have to follow the strict linearity rule of PFNMAP mappings in + * order to support COWable mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. */ -struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte) +static inline struct page *__vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, bool special, + unsigned long long entry, enum pgtable_level level) { - unsigned long pfn = pte_pfn(pte); - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return NULL; - if (is_zero_pfn(pfn)) + if (unlikely(special)) { +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; - if (pte_devmap(pte)) + } /* - * NOTE: New users of ZONE_DEVICE will not set pte_devmap() - * and will have refcounts incremented on their struct pages - * when they are inserted into PTEs, thus they are safe to - * return here. Legacy ZONE_DEVICE pages that set pte_devmap() - * do not have refcounts. 
Example of legacy ZONE_DEVICE is - * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table + * mappings (incl. shared zero folios) are marked accordingly. */ - return NULL; - - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ + } else { + if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + /* If it has a "struct page", it's "normal". */ + if (!pfn_valid(pfn)) + return NULL; + } else { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; + /* Only CoW'ed anon folios are "normal". */ + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } } - } - if (is_zero_pfn(pfn)) - return NULL; + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + } -check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); + /* Corrupted page table entry. */ + print_bad_page_map(vma, addr, entry, NULL, level); return NULL; } - /* * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. + * For example, VDSO mappings can cause them to exist. */ -out: + VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); return pfn_to_page(pfn); } +/** + * vm_normal_page() - Get the "struct page" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. 
+ * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), + pte_val(pte), PGTABLE_LEVEL_PTE); +} + +/** + * vm_normal_folio() - Get the "struct folio" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -654,47 +777,38 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, return NULL; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/** + * vm_normal_page_pmd() - Get the "struct page" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct page" associated with a PMD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { - unsigned long pfn = pmd_pfn(pmd); - - /* - * There is no pmd_special() but there may be special pmds, e.g. - * in a direct-access (dax) mapping, so let's just replicate the - * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. 
- */ - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (pmd_devmap(pmd)) - return NULL; - if (is_huge_zero_pmd(pmd)) - return NULL; - if (unlikely(pfn > highest_memmap_pfn)) - return NULL; - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), + pmd_val(pmd), PGTABLE_LEVEL_PMD); } +/** + * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct folio" associated with a PMD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { @@ -704,44 +818,74 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. 
+ */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif +/** + * restore_exclusive_pte - Restore a device-exclusive entry + * @vma: VMA covering @address + * @folio: the mapped folio + * @page: the mapped folio page + * @address: the virtual address + * @ptep: pte pointer into the locked page table mapping the folio page + * @orig_pte: pte value at @ptep + * + * Restore a device-exclusive non-swap entry to an ordinary present pte. + * + * The folio and the page table must be locked, and MMU notifiers must have + * been called to invalidate any (exclusive) device mappings. + * + * Locking the folio makes sure that anybody who just converted the pte to + * a device-exclusive entry can map it into the device to make forward + * progress without others converting it back until the folio was unlocked. + * + * If the folio lock ever becomes an issue, we can stop relying on the folio + * lock; it might make some scenarios with heavy thrashing less likely to + * make forward progress, but these scenarios might not be valid use cases. + * + * Note that the folio lock does not protect against all cases of concurrent + * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers + * must use MMU notifiers to sync against any concurrent changes. 
+ */ static void restore_exclusive_pte(struct vm_area_struct *vma, - struct page *page, unsigned long address, - pte_t *ptep) + struct folio *folio, struct page *page, unsigned long address, + pte_t *ptep, pte_t orig_pte) { - struct folio *folio = page_folio(page); - pte_t orig_pte; pte_t pte; - swp_entry_t entry; - orig_pte = ptep_get(ptep); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - entry = pte_to_swp_entry(orig_pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_mkuffd_wp(pte); - else if (is_writable_device_exclusive_entry(entry)) - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - - VM_BUG_ON_FOLIO(pte_write(pte) && (!folio_test_anon(folio) && - PageAnonExclusive(page)), folio); - - /* - * No need to take a page reference as one was already - * created when the swap entry was made. - */ - if (folio_test_anon(folio)) - folio_add_anon_rmap_pte(folio, page, vma, address, RMAP_NONE); - else - /* - * Currently device exclusive access only supports anonymous - * memory so the entry shouldn't point to a filebacked page. - */ - WARN_ON_ONCE(1); + if ((vma->vm_flags & VM_WRITE) && + can_change_pte_writable(vma, address, pte)) { + if (folio_test_dirty(folio)) + pte = pte_mkdirty(pte); + pte = pte_mkwrite(pte, vma); + } set_pte_at(vma->vm_mm, address, ptep, pte); /* @@ -755,16 +899,16 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Tries to restore an exclusive pte if the page lock can be acquired without * sleeping. 
*/ -static int -try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr) +static int try_restore_exclusive_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, pte_t orig_pte) { - swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte)); - struct page *page = pfn_swap_entry_to_page(entry); + const softleaf_t entry = softleaf_from_pte(orig_pte); + struct page *page = softleaf_to_page(entry); + struct folio *folio = page_folio(page); - if (trylock_page(page)) { - restore_exclusive_pte(vma, page, addr, src_pte); - unlock_page(page); + if (folio_trylock(folio)) { + restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte); + folio_unlock(folio); return 0; } @@ -782,14 +926,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); + softleaf_t entry = softleaf_from_pte(orig_pte); pte_t pte = orig_pte; struct folio *folio; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); - if (likely(!non_swap_entry(entry))) { + if (likely(softleaf_is_swap(entry))) { if (swap_duplicate(entry) < 0) return -EIO; @@ -807,12 +951,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pte_at(src_mm, addr, src_pte, pte); } rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); + } else if (softleaf_is_migration(entry)) { + folio = softleaf_to_folio(entry); rss[mm_counter(folio)]++; - if (!is_readable_migration_entry(entry) && + if (!softleaf_is_migration_read(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent and child @@ -821,15 +965,15 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ entry = 
make_readable_migration_entry( swp_offset(entry)); - pte = swp_entry_to_pte(entry); + pte = softleaf_to_pte(entry); if (pte_swp_soft_dirty(orig_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(orig_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_private_entry(entry)) { - page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_private(entry)) { + page = softleaf_to_page(entry); folio = page_folio(page); /* @@ -844,7 +988,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, folio_get(folio); rss[mm_counter(folio)]++; /* Cannot fail as these pages cannot get pinned. */ - folio_try_dup_anon_rmap_pte(folio, page, src_vma); + folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma); /* * We do not preserve soft-dirty information, because so @@ -853,7 +997,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * when a device driver is involved (you cannot easily * save and restore device driver state). */ - if (is_writable_device_private_entry(entry) && + if (softleaf_is_device_private_write(entry) && is_cow_mapping(vm_flags)) { entry = make_readable_device_private_entry( swp_offset(entry)); @@ -862,7 +1006,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } - } else if (is_device_exclusive_entry(entry)) { + } else if (softleaf_is_device_exclusive(entry)) { /* * Make device exclusive entries present by restoring the * original entry then copying as for a present pte. Device @@ -870,10 +1014,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, * (ie. COW) mappings. 
*/ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); - if (try_restore_exclusive_pte(src_pte, src_vma, addr)) + if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte)) return -EBUSY; return -ENOENT; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { pte_marker marker = copy_pte_marker(entry, dst_vma); if (marker) @@ -915,15 +1059,18 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ + + if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma)) + return -EHWPOISON; + *prealloc = NULL; - copy_user_highpage(&new_folio->page, page, addr, src_vma); __folio_mark_uptodate(new_folio); - folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, dst_vma); rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); + pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte))) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -967,10 +1114,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -985,28 +1131,25 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. 
*/ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, - nr, src_vma))) { + nr, dst_vma, src_vma))) { folio_ref_sub(folio, nr); return -EAGAIN; } rss[MM_ANONPAGES] += nr; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_ptes(folio, page, nr); + folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; @@ -1020,7 +1163,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * guarantee the pinned page won't be randomly replaced in the * future. */ - if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { + if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) { /* Page may be pinned, we have to copy. 
*/ folio_put(folio); err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, @@ -1030,7 +1173,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); } else { - folio_dup_file_rmap_pte(folio, page); + folio_dup_file_rmap_pte(folio, page, dst_vma); rss[mm_counter_file(folio)]++; } @@ -1047,8 +1190,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm, if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; @@ -1071,11 +1213,12 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; - swp_entry_t entry = (swp_entry_t){0}; + softleaf_t entry = softleaf_mk_none(); struct folio *prealloc = NULL; int nr; @@ -1096,7 +1239,15 @@ again: ret = -ENOMEM; goto out; } - src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl); + + /* + * We already hold the exclusive mmap_lock, the copy_pte_range() and + * retract_page_tables() are using vma->anon_vma to be exclusive, so + * the PTE page is stable, and there is no need to get pmdval and do + * pmd_same() check. 
+ */ + src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, + &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ @@ -1131,7 +1282,7 @@ again: dst_vma, src_vma, addr, rss); if (ret == -EIO) { - entry = pte_to_swp_entry(ptep_get(src_pte)); + entry = softleaf_from_pte(ptep_get(src_pte)); break; } else if (ret == -EBUSY) { break; @@ -1155,8 +1306,9 @@ again: /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. + * If copy failed due to hwpoison in source page, break out. */ - if (unlikely(ret == -EAGAIN)) + if (unlikely(ret == -EAGAIN || ret == -EHWPOISON)) break; if (unlikely(prealloc)) { /* @@ -1186,7 +1338,7 @@ again: goto out; } entry.val = 0; - } else if (ret == -EBUSY) { + } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) { goto out; } else if (ret == -EAGAIN) { prealloc = folio_prealloc(src_mm, src_vma, addr, false); @@ -1223,9 +1375,9 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) - || pmd_devmap(*src_pmd)) { + if (pmd_is_huge(*src_pmd)) { int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); @@ -1260,7 +1412,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + if (pud_trans_huge(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); @@ -1313,18 +1465,12 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { + if (src_vma->vm_flags & VM_COPY_ON_FORK) + return true; /* - * Always copy pgtables when dst_vma has uffd-wp 
enabled even if it's - * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable - * contains uffd-wp protection information, that's something we can't - * retrieve from page cache, and skip copying will lose those info. + * The presence of an anon_vma indicates an anonymous VMA has page + * tables which naturally cannot be reconstituted on page fault. */ - if (userfaultfd_wp(dst_vma)) - return true; - - if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return true; - if (src_vma->anon_vma) return true; @@ -1341,12 +1487,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next; bool is_cow; int ret; @@ -1356,16 +1502,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); - if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); - if (ret) - return ret; - } - /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the @@ -1398,7 +1534,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1415,7 +1550,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we 
zap COWed pages only if the caller wants to */ @@ -1434,7 +1569,7 @@ static inline bool should_zap_folio(struct zap_details *details, return !folio_test_anon(folio); } -static inline bool zap_drop_file_uffd_wp(struct zap_details *details) +static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; @@ -1445,34 +1580,44 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details) /* * This function makes sure that we'll replace the none pte with an uffd-wp * swap special pte marker when necessary. Must be with the pgtable lock held. + * + * Returns true if uffd-wp ptes was installed, false otherwise. */ -static inline void +static inline bool zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, int nr, struct zap_details *details, pte_t pteval) { + bool was_installed = false; + + if (!uffd_supports_wp_marker()) + return false; + /* Zap on anonymous always means dropping everything */ if (vma_is_anonymous(vma)) - return; + return false; - if (zap_drop_file_uffd_wp(details)) - return; + if (zap_drop_markers(details)) + return false; for (;;) { /* the PFN in the PTE is irrelevant. 
*/ - pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); + if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval)) + was_installed = true; if (--nr == 0) break; pte++; addr += PAGE_SIZE; } + + return was_installed; } static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, struct folio *folio, struct page *page, pte_t *pte, pte_t ptent, unsigned int nr, unsigned long addr, struct zap_details *details, int *rss, - bool *force_flush, bool *force_break) + bool *force_flush, bool *force_break, bool *any_skipped) { struct mm_struct *mm = tlb->mm; bool delay_rmap = false; @@ -1498,8 +1643,8 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entries(tlb, pte, nr, addr); if (unlikely(userfaultfd_pte_wp(vma, ptent))) - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, - ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, + nr, details, ptent); if (!delay_rmap) { folio_remove_rmap_ptes(folio, page, nr, vma); @@ -1523,9 +1668,8 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma, pte_t *pte, pte_t ptent, unsigned int max_nr, unsigned long addr, struct zap_details *details, int *rss, bool *force_flush, - bool *force_break) + bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1538,34 +1682,141 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); if (userfaultfd_pte_wp(vma, ptent)) - zap_install_uffd_wp_if_needed(vma, addr, pte, 1, - details, ptent); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, + pte, 1, details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) + if 
(unlikely(!should_zap_folio(details, folio))) { + *any_skipped = true; return 1; + } /* * Make sure that the common "small folio" case is as fast as possible * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, - force_break); + force_break, any_skipped); return nr; } zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr, - details, rss, force_flush, force_break); + details, rss, force_flush, force_break, any_skipped); return 1; } +static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, pte_t ptent, + unsigned int max_nr, unsigned long addr, + struct zap_details *details, int *rss, bool *any_skipped) +{ + softleaf_t entry; + int nr = 1; + + *any_skipped = true; + entry = softleaf_from_pte(ptent); + if (softleaf_is_device_private(entry) || + softleaf_is_device_exclusive(entry)) { + struct page *page = softleaf_to_page(entry); + struct folio *folio = page_folio(page); + + if (unlikely(!should_zap_folio(details, folio))) + return 1; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). 
+ */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(folio)]--; + folio_remove_rmap_pte(folio, page, vma); + folio_put(folio); + } else if (softleaf_is_swap(entry)) { + /* Genuine swap entries, hence a private anon pages */ + if (!should_zap_cows(details)) + return 1; + + nr = swap_pte_batch(pte, max_nr, ptent); + rss[MM_SWAPENTS] -= nr; + free_swap_and_cache_nr(entry, nr); + } else if (softleaf_is_migration(entry)) { + struct folio *folio = softleaf_to_folio(entry); + + if (!should_zap_folio(details, folio)) + return 1; + rss[mm_counter(folio)]--; + } else if (softleaf_is_uffd_wp_marker(entry)) { + /* + * For anon: always drop the marker; for file: only + * drop the marker if explicitly requested. + */ + if (!vma_is_anonymous(vma) && !zap_drop_markers(details)) + return 1; + } else if (softleaf_is_guard_marker(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) + return 1; + } else if (softleaf_is_hwpoison(entry) || + softleaf_is_poison_marker(entry)) { + if (!should_zap_cows(details)) + return 1; + } else { + /* We should have covered all the swap entry types */ + pr_alert("unrecognized swap entry 0x%lx\n", entry.val); + WARN_ON_ONCE(1); + } + clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm); + *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); + + return nr; +} + +static inline int do_zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, unsigned long end, + struct zap_details *details, int *rss, + bool *force_flush, bool *force_break, + bool *any_skipped) +{ + pte_t ptent = ptep_get(pte); + int max_nr = (end - addr) / PAGE_SIZE; + int nr = 0; + + /* Skip all consecutive none ptes */ + if (pte_none(ptent)) { + for (nr = 1; nr < max_nr; nr++) { + ptent = ptep_get(pte + nr); + if (!pte_none(ptent)) + break; + } + max_nr -= nr; + if 
(!max_nr) + return nr; + pte += nr; + addr += nr * PAGE_SIZE; + } + + if (pte_present(ptent)) + nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, force_flush, force_break, + any_skipped); + else + nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr, + details, rss, any_skipped); + + return nr; +} + static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, @@ -1577,9 +1828,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - swp_entry_t entry; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = true; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1589,82 +1844,35 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { - pte_t ptent = ptep_get(pte); - struct folio *folio; - struct page *page; - int max_nr; + bool any_skipped = false; - nr = 1; - if (pte_none(ptent)) - continue; - - if (need_resched()) + if (need_resched()) { + direct_reclaim = false; break; - - if (pte_present(ptent)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr, - addr, details, rss, &force_flush, - &force_break); - if (unlikely(force_break)) { - addr += nr * PAGE_SIZE; - break; - } - continue; } - entry = pte_to_swp_entry(ptent); - if (is_device_private_entry(entry) || - is_device_exclusive_entry(entry)) { - page = pfn_swap_entry_to_page(entry); - folio = page_folio(page); - if (unlikely(!should_zap_folio(details, folio))) - continue; - /* - * Both device private/exclusive mappings should only - * work with anonymous page so far, so we don't need to - * consider uffd-wp bit when zap. 
For more information, - * see zap_install_uffd_wp_if_needed(). - */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); - rss[mm_counter(folio)]--; - if (is_device_private_entry(entry)) - folio_remove_rmap_pte(folio, page, vma); - folio_put(folio); - } else if (!non_swap_entry(entry)) { - max_nr = (end - addr) / PAGE_SIZE; - nr = swap_pte_batch(pte, max_nr, ptent); - /* Genuine swap entries, hence a private anon pages */ - if (!should_zap_cows(details)) - continue; - rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); - } else if (is_migration_entry(entry)) { - folio = pfn_swap_entry_folio(entry); - if (!should_zap_folio(details, folio)) - continue; - rss[mm_counter(folio)]--; - } else if (pte_marker_entry_uffd_wp(entry)) { - /* - * For anon: always drop the marker; for file: only - * drop the marker if explicitly requested. - */ - if (!vma_is_anonymous(vma) && - !zap_drop_file_uffd_wp(details)) - continue; - } else if (is_hwpoison_entry(entry) || - is_poisoned_swp_entry(entry)) { - if (!should_zap_cows(details)) - continue; - } else { - /* We should have covered all the swap entry types */ - pr_alert("unrecognized swap entry 0x%lx\n", entry.val); - WARN_ON_ONCE(1); + nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, + &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; + if (unlikely(force_break)) { + addr += nr * PAGE_SIZE; + direct_reclaim = false; + break; } - clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); - zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent); } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + /* + * Fast path: try to hold the pmd lock and unmap the PTE page. + * + * If the pte lock was released midway (retry case), or if the attempt + * to hold the pmd lock failed, then we need to recheck all pte entries + * to ensure they are still none, thereby preventing the pte entries + * from being repopulated by another thread. 
+ */ + if (can_reclaim_pt && direct_reclaim && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1684,6 +1892,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } @@ -1698,9 +1920,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (pmd_is_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) - __split_huge_pmd(vma, pmd, addr, false, NULL); + __split_huge_pmd(vma, pmd, addr, false); else if (zap_huge_pmd(tlb, vma, pmd, addr)) { addr = next; continue; @@ -1740,7 +1962,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (pud_trans_huge(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); @@ -1800,8 +2022,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, - struct zap_details *details, bool mm_wr_locked) + unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1815,9 +2036,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) uprobe_munmap(vma, start, end); - if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0, mm_wr_locked); - if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* @@ -1850,7 
+2068,6 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @tree_end: The maximum index to check - * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -1865,8 +2082,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { struct mmu_notifier_range range; struct zap_details details = { @@ -1882,8 +2098,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long start = start_addr; unsigned long end = end_addr; hugetlb_zap_begin(vma, &start, &end); - unmap_single_vma(tlb, vma, start, end, &details, - mm_wr_locked); + unmap_single_vma(tlb, vma, start, end, &details); hugetlb_zap_end(vma, &details); vma = mas_find(mas, tree_end - 1); } while (vma && likely(!xa_is_zero(vma))); @@ -1891,36 +2106,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, } /** - * zap_page_range_single - remove user pages in a given range + * zap_page_range_single_batched - remove user pages in a given range + * @tlb: pointer to the caller's struct mmu_gather * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap + * @address: starting address of pages to remove + * @size: number of bytes to remove * @details: details of shared cache invalidation * - * The range must fit into one VMA. + * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for + * hugetlb, @tlb is flushed and re-initialized by this function. 
*/ -void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single_batched(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { const unsigned long end = address + size; struct mmu_notifier_range range; - struct mmu_gather tlb; - lru_add_drain(); + VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); hugetlb_zap_begin(vma, &range.start, &range.end); - tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); /* * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details, false); + unmap_single_vma(tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); + if (is_vm_hugetlb_page(vma)) { + /* + * flush tlb and free resources before hugetlb_zap_end(), to + * avoid concurrent page faults' allocation failure. + */ + tlb_finish_mmu(tlb); + hugetlb_zap_end(vma, details); + tlb_gather_mmu(tlb, vma->vm_mm); + } +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. 
+ */ +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + unsigned long size, struct zap_details *details) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + zap_page_range_single_batched(&tlb, vma, address, size, details); tlb_finish_mmu(&tlb); - hugetlb_zap_end(vma, details); } /** @@ -1977,54 +2220,111 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, return pte_alloc_map_lock(mm, pmd, addr, ptl); } -static int validate_page_before_insert(struct page *page) +static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma) +{ + VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP); + /* + * Whoever wants to forbid the zeropage after some zeropages + * might already have been mapped has to scan the page tables and + * bail out on any zeropages. Zeropages in COW mappings can + * be unshared using FAULT_FLAG_UNSHARE faults. + */ + if (mm_forbids_zeropage(vma->vm_mm)) + return false; + /* zeropages in COW mappings are common and unproblematic. */ + if (is_cow_mapping(vma->vm_flags)) + return true; + /* Mappings that do not allow for writable PTEs are unproblematic. */ + if (!(vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) + return true; + /* + * Why not allow any VMA that has vm_ops->pfn_mkwrite? GUP could + * find the shared zeropage and longterm-pin it, which would + * be problematic as soon as the zeropage gets replaced by a different + * page due to vma->vm_ops->pfn_mkwrite, because what's mapped would + * now differ to what GUP looked up. FSDAX is incompatible to + * FOLL_LONGTERM and VM_IO is incompatible to GUP completely (see + * check_vma_flags). 
+ */ + return vma->vm_ops && vma->vm_ops->pfn_mkwrite && + (vma_is_fsdax(vma) || vma->vm_flags & VM_IO); +} + +static int validate_page_before_insert(struct vm_area_struct *vma, + struct page *page) { struct folio *folio = page_folio(page); - if (folio_test_anon(folio) || folio_test_slab(folio) || - page_has_type(page)) + if (!folio_ref_count(folio)) + return -EINVAL; + if (unlikely(is_zero_folio(folio))) { + if (!vm_mixed_zeropage_allowed(vma)) + return -EINVAL; + return 0; + } + if (folio_test_anon(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; } static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, - unsigned long addr, struct page *page, pgprot_t prot) + unsigned long addr, struct page *page, + pgprot_t prot, bool mkwrite) { struct folio *folio = page_folio(page); + pte_t pteval = ptep_get(pte); + + if (!pte_none(pteval)) { + if (!mkwrite) + return -EBUSY; + + /* see insert_pfn(). */ + if (pte_pfn(pteval) != page_to_pfn(page)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval))); + return -EFAULT; + } + pteval = maybe_mkwrite(pteval, vma); + pteval = pte_mkyoung(pteval); + if (ptep_set_access_flags(vma, addr, pte, pteval, 1)) + update_mmu_cache(vma, addr, pte); + return 0; + } - if (!pte_none(ptep_get(pte))) - return -EBUSY; /* Ok, finally just insert the thing.. 
*/ - folio_get(folio); - inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); - folio_add_file_rmap_pte(folio, page, vma); - set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); + pteval = mk_pte(page, prot); + if (unlikely(is_zero_folio(folio))) { + pteval = pte_mkspecial(pteval); + } else { + folio_get(folio); + pteval = mk_pte(page, prot); + if (mkwrite) { + pteval = pte_mkyoung(pteval); + pteval = maybe_mkwrite(pte_mkdirty(pteval), vma); + } + inc_mm_counter(vma->vm_mm, mm_counter_file(folio)); + folio_add_file_rmap_pte(folio, page, vma); + } + set_pte_at(vma->vm_mm, addr, pte, pteval); return 0; } -/* - * This is the old fallback for page remapping. - * - * For historical reasons, it only allows reserved pages. Only - * old drivers should use this, and they needed to mark their - * pages reserved for the old functions anyway. - */ static int insert_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, pgprot_t prot) + struct page *page, pgprot_t prot, bool mkwrite) { int retval; pte_t *pte; spinlock_t *ptl; - retval = validate_page_before_insert(page); + retval = validate_page_before_insert(vma, page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, + mkwrite); pte_unmap_unlock(pte, ptl); out: return retval; @@ -2035,12 +2335,10 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, { int err; - if (!page_count(page)) - return -EINVAL; - err = validate_page_before_insert(page); + err = validate_page_before_insert(vma, page); if (err) return err; - return insert_page_into_pte_locked(vma, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot, false); } /* insert_pages() amortizes the cost of spinlock operations @@ -2143,7 +2441,8 @@ EXPORT_SYMBOL(vm_insert_pages); * @page: source kernel page 
* * This allows drivers to insert individual pages they've allocated - * into a user vma. + * into a user vma. The zeropage is supported in some VMAs, + * see vm_mixed_zeropage_allowed(). * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as @@ -2170,14 +2469,12 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (!page_count(page)) - return -EINVAL; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vm_flags_set(vma, VM_MIXEDMAP); } - return insert_page(vma, addr, page, vma->vm_page_prot); + return insert_page(vma, addr, page, vma->vm_page_prot, false); } EXPORT_SYMBOL(vm_insert_page); @@ -2189,6 +2486,8 @@ EXPORT_SYMBOL(vm_insert_page); * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. + * The zeropage is supported in some VMAs, see + * vm_mixed_zeropage_allowed(). * * Return: 0 on success and error code otherwise. */ @@ -2263,7 +2562,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot, bool mkwrite) + unsigned long pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; @@ -2285,7 +2584,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, * allocation and mapping invalidation so just skip the * update. */ - if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) { + if (pte_pfn(entry) != pfn) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry))); goto out_unlock; } @@ -2298,10 +2597,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } /* Ok, finally just insert the thing.. 
*/ - if (pfn_t_devmap(pfn)) - entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); - else - entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + entry = pte_mkspecial(pfn_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); @@ -2370,10 +2666,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, - false); + return insert_pfn(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); @@ -2404,34 +2699,35 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vmf_insert_pfn); -static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) +static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn, + bool mkwrite) { + if (unlikely(is_zero_pfn(pfn)) && + (mkwrite || !vm_mixed_zeropage_allowed(vma))) + return false; /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; - if (pfn_t_devmap(pfn)) - return true; - if (pfn_t_special(pfn)) - return true; - if (is_zero_pfn(pfn_t_to_pfn(pfn))) + if (is_zero_pfn(pfn)) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn, bool mkwrite) + unsigned long addr, unsigned long pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; int err; - BUG_ON(!vm_mixed_ok(vma, pfn)); + if (!vm_mixed_ok(vma, pfn, mkwrite)) + return VM_FAULT_SIGBUS; if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; - track_pfn_insert(vma, &pgprot, pfn); + pfnmap_setup_cachemode_pfn(pfn, &pgprot); - if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; /* @@ -2441,8 +2737,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct 
*vma, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && - !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) { struct page *page; /* @@ -2450,8 +2745,8 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ - page = pfn_to_page(pfn_t_to_pfn(pfn)); - err = insert_page(vma, addr, page, pgprot); + page = pfn_to_page(pfn); + err = insert_page(vma, addr, page, pgprot, mkwrite); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } @@ -2464,8 +2759,28 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page, + bool write) +{ + pgprot_t pgprot = vmf->vma->vm_page_prot; + unsigned long addr = vmf->address; + int err; + + if (addr < vmf->vma->vm_start || addr >= vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + err = insert_page(vmf->vma, addr, page, pgprot, write); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite); + vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) + unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, false); } @@ -2477,11 +2792,10 @@ EXPORT_SYMBOL(vmf_insert_mixed); * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, - unsigned long addr, pfn_t pfn) + unsigned long addr, unsigned long pfn) { return __vm_insert_mixed(vma, addr, pfn, true); } -EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. 
the old @@ -2581,11 +2895,26 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* - * Variant of remap_pfn_range that does not call track_pfn_remap. The caller - * must have pre-validated the caching bits of the pgprot_t. - */ -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, +static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr, + unsigned long end, unsigned long vm_start, unsigned long vm_end, + unsigned long pfn, pgoff_t *vm_pgoff_p) +{ + /* + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vm_flags)) { + if (addr != vm_start || end != vm_end) + return -EINVAL; + *vm_pgoff_p = pfn; + } + + return 0; +} + +static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; @@ -2597,31 +2926,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_PFNMAP tells the core MM that the base pages are just - * raw PFN mappings, and do not have a "struct page" associated - * with them. - * VM_DONTEXPAND - * Disable vma merging and expanding with mremap(). - * VM_DONTDUMP - * Omit vma from core dump, even when VM_IO turned off. - * - * There's a horrible special case to handle copy-on-write - * behaviour that some programs depend on. We mark the "original" - * un-COW'ed pages by matching them up with "vma->vm_pgoff". - * See vm_normal_page() for details. 
- */ - if (is_cow_mapping(vma->vm_flags)) { - if (addr != vma->vm_start || end != vma->vm_end) - return -EINVAL; - vma->vm_pgoff = pfn; - } - - vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) != VM_REMAP_FLAGS); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2638,6 +2943,134 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, return 0; } +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. + */ +static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int error = remap_pfn_range_internal(vma, addr, pfn, size, prot); + + if (!error) + return 0; + + /* + * A partial pfn range mapping is dangerous: it does not + * maintain page reference counts, and callers may free + * pages due to the error. So zap it early. + */ + zap_page_range_single(vma, addr, size, NULL); + return error; +} + +#ifdef __HAVE_PFNMAP_TRACKING +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn, + unsigned long size, pgprot_t *prot) +{ + struct pfnmap_track_ctx *ctx; + + if (pfnmap_track(pfn, size, prot)) + return ERR_PTR(-EINVAL); + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (unlikely(!ctx)) { + pfnmap_untrack(pfn, size); + return ERR_PTR(-ENOMEM); + } + + ctx->pfn = pfn; + ctx->size = size; + kref_init(&ctx->kref); + return ctx; +} + +void pfnmap_track_ctx_release(struct kref *ref) +{ + struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref); + + pfnmap_untrack(ctx->pfn, ctx->size); + kfree(ctx); +} + +static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct pfnmap_track_ctx *ctx = NULL; + int err; + + size = PAGE_ALIGN(size); + + /* + * If we cover the full VMA, we'll 
perform actual tracking, and + * remember to untrack when the last reference to our tracking + * context from a VMA goes away. We'll keep tracking the whole pfn + * range even during VMA splits and partial unmapping. + * + * If we only cover parts of the VMA, we'll only setup the cachemode + * in the pgprot for the pfn range. + */ + if (addr == vma->vm_start && addr + size == vma->vm_end) { + if (vma->pfnmap_track_ctx) + return -EINVAL; + ctx = pfnmap_track_ctx_alloc(pfn, size, &prot); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (pfnmap_setup_cachemode(pfn, size, &prot)) { + return -EINVAL; + } + + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (ctx) { + if (err) + kref_put(&ctx->kref, pfnmap_track_ctx_release); + else + vma->pfnmap_track_ctx = ctx; + } + return err; +} + +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_track(vma, addr, pfn, size, prot); +} +#else +static int do_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return remap_pfn_range_notrack(vma, addr, pfn, size, prot); +} +#endif + +void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ + /* + * We set addr=VMA start, end=VMA end here, so this won't fail, but we + * check it again on complete and will fail there if specified addr is + * invalid. 
+ */ + get_remap_pgoff(desc->vm_flags, desc->start, desc->end, + desc->start, desc->end, pfn, &desc->pgoff); + desc->vm_flags |= VM_REMAP_FLAGS; +} + +static int remap_pfn_range_prepare_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size) +{ + unsigned long end = addr + PAGE_ALIGN(size); + int err; + + err = get_remap_pgoff(vma->vm_flags, addr, end, + vma->vm_start, vma->vm_end, + pfn, &vma->vm_pgoff); + if (err) + return err; + + vm_flags_set(vma, VM_REMAP_FLAGS); + return 0; +} + /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to @@ -2655,17 +3088,20 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, { int err; - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + err = remap_pfn_range_prepare_vma(vma, addr, pfn, size); if (err) - return -EINVAL; + return err; - err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); - if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); - return err; + return do_remap_pfn_range(vma, addr, pfn, size, prot); } EXPORT_SYMBOL(remap_pfn_range); +int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + return do_remap_pfn_range(vma, addr, pfn, size, prot); +} + /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to @@ -2743,11 +3179,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (fn) { do { if (create || !pte_none(ptep_get(pte))) { - err = fn(pte++, addr, data); + err = fn(pte, addr, data); if (err) break; } - } while (addr += PAGE_SIZE, addr != end); + } while (pte++, addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; @@ -2886,8 +3322,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) continue; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return -EINVAL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) { + err 
= -EINVAL; + break; + } if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { if (!create) continue; @@ -2928,7 +3366,6 @@ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, { return __apply_to_page_range(mm, addr, size, fn, data, false); } -EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was @@ -2970,10 +3407,8 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - if (copy_mc_user_highpage(dst, src, addr, vma)) { - memory_failure_queue(page_to_pfn(src), 0); + if (copy_mc_user_highpage(dst, src, addr, vma)) return -EHWPOISON; - } return 0; } @@ -3172,6 +3607,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); + VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte))); if (folio) { VM_BUG_ON(folio_test_anon(folio) && @@ -3209,7 +3645,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) } /** - * vmf_anon_prepare - Prepare to handle an anonymous fault. + * __vmf_anon_prepare - Prepare to handle an anonymous fault. * @vmf: The vm_fault descriptor passed from the fault handler. * * When preparing to insert an anonymous page into a VMA from a @@ -3223,7 +3659,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) * Return: 0 if fault handling can proceed. Any other value should be * returned to the caller. 
*/ -vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; @@ -3231,10 +3667,8 @@ vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) if (likely(vma->anon_vma)) return 0; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - if (!mmap_read_trylock(vma->vm_mm)) { - vma_end_read(vma); + if (!mmap_read_trylock(vma->vm_mm)) return VM_FAULT_RETRY; - } } if (__anon_vma_prepare(vma)) ret = VM_FAULT_OOM; @@ -3330,7 +3764,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(&new_folio->page, vma->vm_page_prot); + entry = folio_mk_pte(new_folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3349,7 +3783,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush(vma, vmf->address, vmf->pte); - folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); BUG_ON(unshare && pte_write(entry)); set_pte_at(mm, vmf->address, vmf->pte, entry); @@ -3514,19 +3948,86 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } -static bool wp_can_reuse_anon_folio(struct folio *folio, - struct vm_area_struct *vma) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) { + bool exclusive = false; + + /* Let's just free up a large folio if only a single page is mapped. */ + if (folio_large_mapcount(folio) <= 1) + return false; + /* - * We could currently only reuse a subpage of a large folio if no - * other subpages of the large folios are still mapped. 
However, - * let's just consistently not reuse subpages even if we could - * reuse in that scenario, and give back a large folio a bit - * sooner. + * The assumption for anonymous folios is that each page can only get + * mapped once into each MM. The only exception are KSM folios, which + * are always small. + * + * Each taken mapcount must be paired with exactly one taken reference, + * whereby the refcount must be incremented before the mapcount when + * mapping a page, and the refcount must be decremented after the + * mapcount when unmapping a page. + * + * If all folio references are from mappings, and all mappings are in + * the page tables of this MM, then this folio is exclusive to this MM. */ - if (folio_test_large(folio)) + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) return false; + VM_WARN_ON_ONCE(folio_test_ksm(folio)); + + if (unlikely(folio_test_swapcache(folio))) { + /* + * Note: freeing up the swapcache will fail if some PTEs are + * still swap entries. + */ + if (!folio_trylock(folio)) + return false; + folio_free_swap(folio); + folio_unlock(folio); + } + + if (folio_large_mapcount(folio) != folio_ref_count(folio)) + return false; + + /* Stabilize the mapcount vs. refcount and recheck. */ + folio_lock_large_mapcount(folio); + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio); + + if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids)) + goto unlock; + if (folio_large_mapcount(folio) != folio_ref_count(folio)) + goto unlock; + + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); + VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && + folio_mm_id(folio, 1) != vma->vm_mm->mm_id); + + /* + * Do we need the folio lock? Likely not. If there would have been + * references from page migration/swapout, we would have detected + * an additional folio reference and never ended up here. 
+ */ + exclusive = true; +unlock: + folio_unlock_large_mapcount(folio); + return exclusive; +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static bool __wp_can_reuse_large_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + BUILD_BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio)) + return __wp_can_reuse_large_anon_folio(folio, vma); + /* * We have to verify under folio lock: these early checks are * just an optimization to avoid locking the folio and freeing @@ -3635,13 +4136,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a - * VM_PFNMAP VMA. + * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. 
*/ - if (!vmf->page) + if (!vmf->page || is_fsdax_page(vmf->page)) { + vmf->page = NULL; return wp_pfn_shared(vmf); + } return wp_page_shared(vmf, folio); } @@ -3831,7 +4334,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) folio_put(folio); return ret; } - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); @@ -3839,7 +4342,8 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) - restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, folio, vmf->page, vmf->address, + vmf->pte, vmf->orig_pte); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3863,10 +4367,10 @@ static inline bool should_try_to_free_swap(struct folio *folio, * If we want to map a page that's in the swapcache writable, we * have to detect via the refcount if we're really the exclusive * user. Try freeing the swapcache to get rid of the swapcache - * reference only in case it's likely that we'll be the exlusive user. + * reference only in case it's likely that we'll be the exclusive user. */ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && - folio_ref_count(folio) == 2; + folio_ref_count(folio) == (1 + folio_nr_pages(folio)); } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -3881,7 +4385,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * * This should also cover the case where e.g. the pte changed * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. - * So is_pte_marker() check is not enough to safely drop the pte. + * So pte_is_marker() check is not enough to safely drop the pte. 
*/ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte); @@ -3915,8 +4419,8 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) static vm_fault_t handle_pte_marker(struct vm_fault *vmf) { - swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); - unsigned long marker = pte_marker_get(entry); + const softleaf_t entry = softleaf_from_pte(vmf->orig_pte); + const pte_marker marker = softleaf_to_marker(entry); /* * PTE markers should never be empty. If anything weird happened, @@ -3929,13 +4433,183 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; - if (pte_marker_entry_uffd_wp(entry)) + /* Hitting a guard page is always a fatal condition. */ + if (marker & PTE_MARKER_GUARD) + return VM_FAULT_SIGSEGV; + + if (softleaf_is_uffd_wp_marker(entry)) return pte_marker_handle_uffd_wp(vmf); /* This is an unknown pte marker */ return VM_FAULT_SIGBUS; } +static struct folio *__alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + softleaf_t entry; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); + if (!folio) + return NULL; + + entry = softleaf_from_pte(vmf->orig_pte); + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + GFP_KERNEL, entry)) { + folio_put(folio); + return NULL; + } + + return folio; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * Check if the PTEs within a range are contiguous swap entries + * and have consistent swapcache, zeromap. 
+ */ +static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) +{ + unsigned long addr; + softleaf_t entry; + int idx; + pte_t pte; + + addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE); + idx = (vmf->address - addr) / PAGE_SIZE; + pte = ptep_get(ptep); + + if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx))) + return false; + entry = softleaf_from_pte(pte); + if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages) + return false; + + /* + * swap_read_folio() can't handle the case a large folio is hybridly + * from different backends. And they are likely corner cases. Similar + * things might be added once zswap support large folios. + */ + if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages)) + return false; + if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages)) + return false; + + return true; +} + +static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset, + unsigned long addr, + unsigned long orders) +{ + int order, nr; + + order = highest_order(orders); + + /* + * To swap in a THP with nr pages, we require that its first swap_offset + * is aligned with that number, as it was when the THP was swapped out. + * This helps filter out most invalid entries. + */ + while (orders) { + nr = 1 << order; + if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr) + break; + order = next_order(&orders, order); + } + + return orders; +} + +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long orders; + struct folio *folio; + unsigned long addr; + softleaf_t entry; + spinlock_t *ptl; + pte_t *pte; + gfp_t gfp; + int order; + + /* + * If uffd is active for the vma we need per-page fault fidelity to + * maintain the uffd semantics. + */ + if (unlikely(userfaultfd_armed(vma))) + goto fallback; + + /* + * A large swapped out folio could be partially or fully in zswap. We + * lack handling for such cases, so fallback to swapping in order-0 + * folio. 
+ */ + if (!zswap_never_enabled()) + goto fallback; + + entry = softleaf_from_pte(vmf->orig_pte); + /* + * Get a list of all the (large) orders below PMD_ORDER that are enabled + * and suitable for swapping THP. + */ + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); + orders = thp_vma_suitable_orders(vma, vmf->address, orders); + orders = thp_swap_suitable_orders(swp_offset(entry), + vmf->address, orders); + + if (!orders) + goto fallback; + + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address & PMD_MASK, &ptl); + if (unlikely(!pte)) + goto fallback; + + /* + * For do_swap_page, find the highest order where the aligned range is + * completely swap entries with contiguous swap offsets. + */ + order = highest_order(orders); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order)) + break; + order = next_order(&orders, order); + } + + pte_unmap_unlock(pte, ptl); + + /* Try allocating the highest of the remaining orders. */ + gfp = vma_thp_gfp_mask(vma); + while (orders) { + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + folio = vma_alloc_folio(gfp, order, vma, addr); + if (folio) { + if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, + gfp, entry)) + return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); + folio_put(folio); + } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); + order = next_order(&orders, order); + } + +fallback: + return __alloc_swap_folio(vmf); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static struct folio *alloc_swap_folio(struct vm_fault *vmf) +{ + return __alloc_swap_folio(vmf); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. 
@@ -3948,28 +4622,33 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; + DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool need_clear_cache = false; bool exclusive = false; - swp_entry_t entry; + softleaf_t entry; pte_t pte; vm_fault_t ret = 0; void *shadow = NULL; + int nr_pages; + unsigned long page_idx; + unsigned long address; + pte_t *ptep; if (!pte_unmap_same(vmf)) goto out; - entry = pte_to_swp_entry(vmf->orig_pte); - if (unlikely(non_swap_entry(entry))) { - if (is_migration_entry(entry)) { + entry = softleaf_from_pte(vmf->orig_pte); + if (unlikely(!softleaf_is_swap(entry))) { + if (softleaf_is_migration(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); - } else if (is_device_exclusive_entry(entry)) { - vmf->page = pfn_swap_entry_to_page(entry); + } else if (softleaf_is_device_exclusive(entry)) { + vmf->page = softleaf_to_page(entry); ret = remove_device_exclusive_entry(vmf); - } else if (is_device_private_entry(entry)) { + } else if (softleaf_is_device_private(entry)) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -3980,7 +4659,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; } - vmf->page = pfn_swap_entry_to_page(entry); + vmf->page = softleaf_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!vmf->pte || @@ -3992,13 +4671,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Get a page reference while we know the page can't be * freed. 
*/ - get_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); - put_page(vmf->page); - } else if (is_hwpoison_entry(entry)) { + if (trylock_page(vmf->page)) { + struct dev_pagemap *pgmap; + + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + pgmap = page_pgmap(vmf->page); + ret = pgmap->ops->migrate_to_ram(vmf); + unlock_page(vmf->page); + put_page(vmf->page); + } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_pte_marker_entry(entry)) { + } else if (softleaf_is_marker(entry)) { ret = handle_pte_marker(vmf); } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); @@ -4012,45 +4699,46 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - folio = swap_cache_get_folio(entry, vma, vmf->address); + folio = swap_cache_get_folio(entry); if (folio) - page = folio_file_page(folio, swp_offset(entry)); + swap_update_readahead(folio, vma, vmf->address); swapcache = folio; if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { - /* - * Prevent parallel swapin from proceeding with - * the cache flag. Otherwise, another thread may - * finish swapin first, free the entry, and swapout - * reusing the same entry. It's undetectable as - * pte_same() returns true due to entry reuse. 
- */ - if (swapcache_prepare(entry)) { - /* Relax a bit to prevent rapid repeated page faults */ - schedule_timeout_uninterruptible(1); - goto out; - } - need_clear_cache = true; - /* skip swapcache */ - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, - vma, vmf->address, false); - page = &folio->page; + folio = alloc_swap_folio(vmf); if (folio) { __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_folio(folio, - vma->vm_mm, GFP_KERNEL, - entry)) { - ret = VM_FAULT_OOM; + nr_pages = folio_nr_pages(folio); + if (folio_test_large(folio)) + entry.val = ALIGN_DOWN(entry.val, nr_pages); + /* + * Prevent parallel swapin from proceeding with + * the cache flag. Otherwise, another thread + * may finish swapin first, free the entry, and + * swapout reusing the same entry. It's + * undetectable as pte_same() returns true due + * to entry reuse. + */ + if (swapcache_prepare(entry, nr_pages)) { + /* + * Relax a bit to prevent rapid + * repeated page faults. + */ + add_wait_queue(&swapcache_wq, &wait); + schedule_timeout_uninterruptible(1); + remove_wait_queue(&swapcache_wq, &wait); goto out_page; } - mem_cgroup_swapin_uncharge_swap(entry); + need_clear_cache = true; + + memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(folio, shadow); @@ -4058,14 +4746,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* To provide entry to swap_read_folio() */ folio->swap = entry; - swap_read_folio(folio, true, NULL); + swap_read_folio(folio, NULL); folio->private = NULL; } } else { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - if (page) - folio = page_folio(page); swapcache = folio; } @@ -4086,19 +4772,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - } else if (PageHWPoison(page)) { 
- /* - * hwpoisoned dirty swapcache pages are kept for killing - * owner processes (which may be unknown at hwpoison time) - */ - ret = VM_FAULT_HWPOISON; - goto out_release; } ret |= folio_lock_or_retry(folio, vmf); if (ret & VM_FAULT_RETRY) goto out_release; + page = folio_file_page(folio, swp_offset(entry)); if (swapcache) { /* * Make sure folio_free_swap() or swapoff did not release the @@ -4107,14 +4787,22 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!folio_test_swapcache(folio) || - page_swap_entry(page).val != entry.val)) + if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; + if (unlikely(PageHWPoison(page))) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + goto out_page; + } + /* * KSM sometimes has to copy on read faults, for example, if - * page->index of !PageKSM() pages would be nonlinear inside the - * anon VMA -- PageKSM() is lost on actual swapout. + * folio->index of non-ksm folios would be nonlinear inside the + * anon VMA -- the ksm flag is lost on actual swapout. 
*/ folio = ksm_might_need_to_copy(folio, vma, vmf->address); if (unlikely(!folio)) { @@ -4155,6 +4843,56 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_nomap; } + /* allocated large folios for SWP_SYNCHRONOUS_IO */ + if (folio_test_large(folio) && !folio_test_swapcache(folio)) { + unsigned long nr = folio_nr_pages(folio); + unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE); + unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE; + pte_t *folio_ptep = vmf->pte - idx; + pte_t folio_pte = ptep_get(folio_ptep); + + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto out_nomap; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + goto check_folio; + } + + nr_pages = 1; + page_idx = 0; + address = vmf->address; + ptep = vmf->pte; + if (folio_test_large(folio) && folio_test_swapcache(folio)) { + int nr = folio_nr_pages(folio); + unsigned long idx = folio_page_idx(folio, page); + unsigned long folio_start = address - idx * PAGE_SIZE; + unsigned long folio_end = folio_start + nr * PAGE_SIZE; + pte_t *folio_ptep; + pte_t folio_pte; + + if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start))) + goto check_folio; + if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end))) + goto check_folio; + + folio_ptep = vmf->pte - idx; + folio_pte = ptep_get(folio_ptep); + if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) || + swap_pte_batch(folio_ptep, nr, folio_pte) != nr) + goto check_folio; + + page_idx = idx; + address = folio_start; + ptep = folio_ptep; + nr_pages = nr; + entry = folio->swap; + page = &folio->page; + } + +check_folio: /* * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte * must never point at an anonymous page in the swapcache that is @@ -4214,13 +4952,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * We're already holding a reference on the page but haven't mapped it * yet. 
*/ - swap_free(entry); + swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); + add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages); pte = mk_pte(page, vma->vm_page_prot); + if (pte_swp_soft_dirty(vmf->orig_pte)) + pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) + pte = pte_mkuffd_wp(pte); /* * Same logic as in do_wp_page(); however, optimize for pages that are @@ -4230,32 +4972,44 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ if (!folio_test_ksm(folio) && (exclusive || folio_ref_count(folio) == 1)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - vmf->flags &= ~FAULT_FLAG_WRITE; + if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) && + !pte_needs_soft_dirty_wp(vma, pte)) { + pte = pte_mkwrite(pte, vma); + if (vmf->flags & FAULT_FLAG_WRITE) { + pte = pte_mkdirty(pte); + vmf->flags &= ~FAULT_FLAG_WRITE; + } } rmap_flags |= RMAP_EXCLUSIVE; } - flush_icache_page(vma, page); - if (pte_swp_soft_dirty(vmf->orig_pte)) - pte = pte_mksoft_dirty(pte); - if (pte_swp_uffd_wp(vmf->orig_pte)) - pte = pte_mkuffd_wp(pte); - vmf->orig_pte = pte; + folio_ref_add(folio, nr_pages - 1); + flush_icache_pages(vma, page, nr_pages); + vmf->orig_pte = pte_advance_pfn(pte, page_idx); /* ksm created a completely new copy */ if (unlikely(folio != swapcache && swapcache)) { - folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + } else if (!folio_test_anon(folio)) { + /* + * We currently only expect small !anon folios which are either + * fully exclusive or fully shared, or new allocated large + * folios which are fully exclusive. If we ever get large + * folios within swapcache here, we have to be careful. 
+ */ + VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio)); + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + folio_add_new_anon_rmap(folio, vma, address, rmap_flags); } else { - folio_add_anon_rmap_pte(folio, page, vma, vmf->address, + folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, rmap_flags); } VM_BUG_ON(!folio_test_anon(folio) || (pte_write(pte) && !PageAnonExclusive(page))); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + set_ptes(vma->vm_mm, address, ptep, pte, nr_pages); + arch_do_swap_page_nr(vma->vm_mm, vma, address, + pte, pte, nr_pages); folio_unlock(folio); if (folio != swapcache && swapcache) { @@ -4279,14 +5033,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* No need to invalidate - it was non-present before */ - update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); + update_mmu_cache_range(vmf, vma, address, ptep, nr_pages); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (need_clear_cache) - swapcache_clear(si, entry); + if (need_clear_cache) { + swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; @@ -4301,8 +5058,11 @@ out_release: folio_unlock(swapcache); folio_put(swapcache); } - if (need_clear_cache) - swapcache_clear(si, entry); + if (need_clear_cache) { + swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; @@ -4343,8 +5103,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. 
*/ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -4376,7 +5136,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); @@ -4384,7 +5144,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) goto next; } folio_throttle_swaprate(folio, gfp); - clear_huge_page(&folio->page, vmf->address, 1 << order); + /* + * When a folio is not zeroed during allocation + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. + */ + if (user_alloc_needs_zeroing()) + folio_zero_user(folio, vmf->address); return folio; } next: @@ -4410,7 +5178,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vm_fault_t ret = 0; int nr_pages = 1; pte_t entry; - int i; /* File mapping without ->vm_ops ? 
*/ if (vma->vm_flags & VM_SHARED) @@ -4468,7 +5235,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) */ __folio_mark_uptodate(folio); - entry = mk_pte(&folio->page, vma->vm_page_prot); + entry = folio_mk_pte(folio, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry), vma); @@ -4480,8 +5247,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) update_mmu_tlb(vma, addr, vmf->pte); goto release; } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { - for (i = 0; i < nr_pages; i++) - update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i); + update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages); goto release; } @@ -4498,10 +5264,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) folio_ref_add(folio, nr_pages - 1); add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC); -#endif - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); setpte: if (vmf_orig_pte_uffd_wp(vmf)) @@ -4541,7 +5305,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * lock_page(B) * lock_page(B) * pte_alloc_one - * shrink_page_list + * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) @@ -4596,15 +5360,25 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { - struct folio *folio = page_folio(page); struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; + /* + * It is too late to allocate a small folio, we already have a large + * folio in the pagecache: especially 
s390 KVM cannot tolerate any + * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. + */ + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) + return ret; + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; @@ -4637,7 +5411,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) flush_icache_pages(vma, page, HPAGE_PMD_NR); - entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = folio_mk_pmd(folio, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -4662,7 +5436,7 @@ out: return ret; } #else -vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page) { return VM_FAULT_FALLBACK; } @@ -4670,7 +5444,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) /** * set_pte_range - Set a range of PTEs to point to pages in a folio. - * @vmf: Fault decription. + * @vmf: Fault description. * @folio: The folio that contains @page. * @page: The first page to create a PTE for. * @nr: The number of PTEs to create. 
@@ -4681,7 +5455,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); + bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); @@ -4694,12 +5468,14 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_write(entry) && folio_test_dirty(folio)) + entry = pte_mkdirty(entry); if (unlikely(vmf_orig_pte_uffd_wp(vmf))) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { VM_BUG_ON_FOLIO(nr != 1, folio); - folio_add_new_anon_rmap(folio, vma, addr); + folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); } else { folio_add_file_rmap_ptes(folio, page, nr, vma); @@ -4737,9 +5513,16 @@ vm_fault_t finish_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; + struct folio *folio; vm_fault_t ret; bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED); + int type, nr_pages; + unsigned long addr; + bool needs_fallback = false; + +fallback: + addr = vmf->address; /* Did we COW the page? */ if (is_cow) @@ -4747,6 +5530,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf) else page = vmf->page; + folio = page_folio(page); /* * check even for read faults because we might have lost our CoWed * page @@ -4757,9 +5541,26 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } + if (!needs_fallback && vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t file_end; + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + + /* + * Do not allow to map with PTEs beyond i_size and with PMD + * across i_size to preserve SIGBUS semantics. 
+ * + * Make an exception for shmem/tmpfs that for long time + * intentionally mapped with PMDs across i_size. + */ + needs_fallback = !shmem_mapping(mapping) && + file_end < folio_next_index(folio); + } + if (pmd_none(*vmf->pmd)) { - if (PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); + if (!needs_fallback && folio_test_pmd_mappable(folio)) { + ret = do_set_pmd(vmf, folio, page); if (ret != VM_FAULT_FALLBACK) return ret; } @@ -4770,24 +5571,57 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return VM_FAULT_OOM; } + nr_pages = folio_nr_pages(folio); + + /* Using per-page fault to maintain the uffd semantics */ + if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { + nr_pages = 1; + } else if (nr_pages > 1) { + pgoff_t idx = folio_page_idx(folio, page); + /* The page offset of vmf->address within the VMA. */ + pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff; + /* The index of the entry in the pagetable for fault page. */ + pgoff_t pte_off = pte_index(vmf->address); + + /* + * Fallback to per-page fault in case the folio size in page + * cache beyond the VMA limits and PMD pagetable limits. + */ + if (unlikely(vma_off < idx || + vma_off + (nr_pages - idx) > vma_pages(vma) || + pte_off < idx || + pte_off + (nr_pages - idx) > PTRS_PER_PTE)) { + nr_pages = 1; + } else { + /* Now we can set mappings for the whole large folio. */ + addr = vmf->address - idx * PAGE_SIZE; + page = &folio->page; + } + } + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + addr, &vmf->ptl); if (!vmf->pte) return VM_FAULT_NOPAGE; /* Re-check under ptl */ - if (likely(!vmf_pte_changed(vmf))) { - struct folio *folio = page_folio(page); - int type = is_cow ? 
MM_ANONPAGES : mm_counter_file(folio); - - set_pte_range(vmf, folio, page, 1, vmf->address); - add_mm_counter(vma->vm_mm, type, 1); - ret = 0; - } else { - update_mmu_tlb(vma, vmf->address, vmf->pte); + if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) { + update_mmu_tlb(vma, addr, vmf->pte); ret = VM_FAULT_NOPAGE; + goto unlock; + } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) { + needs_fallback = true; + pte_unmap_unlock(vmf->pte, vmf->ptl); + goto fallback; } + folio_ref_add(folio, nr_pages - 1); + set_pte_range(vmf, folio, page, nr_pages, addr); + type = is_cow ? MM_ANONPAGES : mm_counter_file(folio); + add_mm_counter(vma->vm_mm, type, nr_pages); + ret = 0; + +unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -4954,10 +5788,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) if (ret & VM_FAULT_DONE_COW) return ret; - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) { + ret = VM_FAULT_HWPOISON; + goto unlock; + } __folio_mark_uptodate(folio); ret |= finish_fault(vmf); +unlock: unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -5062,18 +5900,46 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf, - unsigned long addr, int page_nid, int *flags) +int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, + unsigned long addr, int *flags, + bool writable, int *last_cpupid) { struct vm_area_struct *vma = vmf->vma; - folio_get(folio); + /* + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. 
This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. + */ + if (!writable) + *flags |= TNF_NO_GROUP; + + /* + * Flag if the folio is shared between multiple address spaces. This + * is later used when determining whether to group tasks together + */ + if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) + *flags |= TNF_SHARED; + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (folio_use_access_time(folio)) + *last_cpupid = (-1 & LAST_CPUPID_MASK); + else + *last_cpupid = folio_last_cpupid(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) { +#ifdef CONFIG_NUMA_BALANCING + count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1); +#endif + if (folio_nid(folio) == numa_node_id()) { count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); *flags |= TNF_FAULT_LOCAL; } @@ -5157,7 +6023,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + return 0; } pte = pte_modify(old_pte, vma->vm_page_prot); @@ -5175,65 +6041,39 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (!folio || folio_is_zone_device(folio)) goto out_map; - /* - * Avoid grouping on RO pages in general. RO pages shouldn't hurt as - * much anyway since they can be in shared cache state. This misses - * the case where a mapping is writable but the process never writes - * to it but pte_write gets cleared during protection updates and - * pte_dirty has unpredictable behaviour between PTE scan updates, - * background writeback, dirty balancing and application behaviour. 
- */ - if (!writable) - flags |= TNF_NO_GROUP; - - /* - * Flag if the folio is shared between multiple address spaces. This - * is later used when determining whether to group tasks together - */ - if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED)) - flags |= TNF_SHARED; - nid = folio_nid(folio); nr_pages = folio_nr_pages(folio); - /* - * For memory tiering mode, cpupid of slow memory page is used - * to record page access time. So use default value. - */ - if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(nid)) - last_cpupid = (-1 & LAST_CPUPID_MASK); - else - last_cpupid = folio_last_cpupid(folio); - target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags); - if (target_nid == NUMA_NO_NODE) { - folio_put(folio); + + target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags, + writable, &last_cpupid); + if (target_nid == NUMA_NO_NODE) + goto out_map; + if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) { + flags |= TNF_MIGRATE_FAIL; goto out_map; } + /* The folio is isolated and isolation code holds a folio reference. 
*/ pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; ignore_writable = true; /* Migrate to the requested node */ - if (migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; - } else { - flags |= TNF_MIGRATE_FAIL; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (unlikely(!vmf->pte)) - goto out; - if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; - } - goto out_map; + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } -out: - if (nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, nid, nr_pages, flags); - return 0; + flags |= TNF_MIGRATE_FAIL; + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!vmf->pte)) + return 0; + if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } out_map: /* * Make it present again, depending on how arch implements @@ -5246,7 +6086,10 @@ out_map: numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, writable); pte_unmap_unlock(vmf->pte, vmf->ptl); - goto out; + + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, nr_pages, flags); + return 0; } static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) @@ -5286,7 +6129,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) split: /* COW or write-notify handled on pte level: split pmd. */ - __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false); return VM_FAULT_FALLBACK; } @@ -5330,6 +6173,45 @@ split: } /* + * The page faults may be spurious because of the racy access to the + * page table. For example, a non-populated virtual page is accessed + * on 2 CPUs simultaneously, thus the page faults are triggered on + * both CPUs. 
However, it's possible that one CPU (say CPU A) cannot + * find the reason for the page fault if the other CPU (say CPU B) has + * changed the page table before the PTE is checked on CPU A. Most of + * the time, the spurious page faults can be ignored safely. However, + * if the page fault is for the write access, it's possible that a + * stale read-only TLB entry exists in the local CPU and needs to be + * flushed on some architectures. This is called the spurious page + * fault fixing. + * + * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page() + * by default and used as such on most architectures, while + * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and + * used as such on most architectures. + */ +static void fix_spurious_fault(struct vm_fault *vmf, + enum pgtable_level ptlevel) +{ + /* Skip spurious TLB flush for retried page fault */ + if (vmf->flags & FAULT_FLAG_TRIED) + return; + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (vmf->flags & FAULT_FLAG_WRITE) { + if (ptlevel == PGTABLE_LEVEL_PTE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, + vmf->pte); + else + flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address, + vmf->pmd); + } +} +/* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most * RISC architectures). The early dirtying is also good on the i386. 
@@ -5358,14 +6240,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { + pmd_t dummy_pmdval; + /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. + * + * Use the maywrite version to indicate that vmf->pte may be + * modified, but since we will use pte_same() to detect the + * change of the !pte_none() entry, there is no need to recheck + * the pmdval. Here we chooes to pass a dummy variable instead + * of NULL, which helps new user think about why this place is + * special. */ - vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &dummy_pmdval, + &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); @@ -5400,23 +6292,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, - vmf->flags & FAULT_FLAG_WRITE)) { + vmf->flags & FAULT_FLAG_WRITE)) update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte, 1); - } else { - /* Skip spurious TLB flush for retried page fault */ - if (vmf->flags & FAULT_FLAG_TRIED) - goto unlock; - /* - * This is needed only for protection faults but the arch code - * is not yet telling us if this is a protection fault or not. - * This still avoids useless tlb flushes for .text page faults - * with threads. 
- */ - if (vmf->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(vmf->vma, vmf->address, - vmf->pte); - } + else + fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -5440,7 +6320,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -5455,8 +6335,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5464,7 +6343,7 @@ retry_pud: pud_t orig_pud = *vmf.pud; barrier(); - if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + if (pud_trans_huge(orig_pud)) { /* * TODO once we support anonymous PUDs: NUMA case and @@ -5490,37 +6369,45 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) + if (ret & VM_FAULT_FALLBACK) + goto fallback; + else return ret; - } else { - vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + } - if (unlikely(is_swap_pmd(vmf.orig_pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(vmf.orig_pmd)); - if (is_pmd_migration_entry(vmf.orig_pmd)) - pmd_migration_entry_wait(mm, vmf.pmd); - return 0; - } - if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) { - if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&vmf); + vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); + if (pmd_none(vmf.orig_pmd)) + 
goto fallback; - if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !pmd_write(vmf.orig_pmd)) { - ret = wp_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; - } else { - huge_pmd_set_accessed(&vmf); - return 0; - } + if (unlikely(!pmd_present(vmf.orig_pmd))) { + if (pmd_is_device_private_entry(vmf.orig_pmd)) + return do_huge_pmd_device_private(&vmf); + + if (pmd_is_migration_entry(vmf.orig_pmd)) + pmd_migration_entry_wait(mm, vmf.pmd); + return 0; + } + if (pmd_trans_huge(vmf.orig_pmd)) { + if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) + return do_huge_pmd_numa_page(&vmf); + + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !pmd_write(vmf.orig_pmd)) { + ret = wp_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + vmf.ptl = pmd_lock(mm, vmf.pmd); + if (!huge_pmd_set_accessed(&vmf)) + fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD); + spin_unlock(vmf.ptl); + return 0; } } +fallback: return handle_pte_fault(&vmf); } @@ -5649,7 +6536,8 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, } /* - * By the time we get here, we already hold the mm semaphore + * By the time we get here, we already hold either the VMA lock or the + * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which). * * The mmap_lock may have been released depending on flags and our * return value. See filemap_fault() and __folio_lock_or_retry(). @@ -5660,6 +6548,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, /* If the fault handler drops the mmap_lock, vma may be freed */ struct mm_struct *mm = vma->vm_mm; vm_fault_t ret; + bool is_droppable; __set_current_state(TASK_RUNNING); @@ -5674,6 +6563,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } + is_droppable = !!(vma->vm_flags & VM_DROPPABLE); + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. 
@@ -5688,8 +6579,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, else ret = __handle_mm_fault(vma, address, flags); + /* + * Warning: It is no longer safe to dereference vma-> after this point, + * because mmap_lock might have been dropped by __handle_mm_fault(), so + * vma might be destroyed from underneath us. + */ + lru_gen_exit_fault(); + /* If the mapping is droppable, then errors due to OOM aren't fatal. */ + if (is_droppable) + ret &= ~VM_FAULT_OOM; + if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* @@ -5708,167 +6609,6 @@ out: } EXPORT_SYMBOL_GPL(handle_mm_fault); -#ifdef CONFIG_LOCK_MM_AND_FIND_VMA -#include <linux/extable.h> - -static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - if (likely(mmap_read_trylock(mm))) - return true; - - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - - return !mmap_read_lock_killable(mm); -} - -static inline bool mmap_upgrade_trylock(struct mm_struct *mm) -{ - /* - * We don't have this operation yet. - * - * It should be easy enough to do: it's basically a - * atomic_long_try_cmpxchg_acquire() - * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but - * it also needs the proper lockdep magic etc. - */ - return false; -} - -static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) -{ - mmap_read_unlock(mm); - if (regs && !user_mode(regs)) { - unsigned long ip = exception_ip(regs); - if (!search_exception_tables(ip)) - return false; - } - return !mmap_write_lock_killable(mm); -} - -/* - * Helper for page fault handling. - * - * This is kind of equivalend to "mmap_read_lock()" followed - * by "find_extend_vma()", except it's a lot more careful about - * the locking (and will drop the lock on failure). 
- * - * For example, if we have a kernel bug that causes a page - * fault, we don't want to just use mmap_read_lock() to get - * the mm lock, because that would deadlock if the bug were - * to happen while we're holding the mm lock for writing. - * - * So this checks the exception tables on kernel faults in - * order to only do this all for instructions that are actually - * expected to fault. - * - * We can also actually take the mm lock for writing if we - * need to extend the vma, which helps the VM layer a lot. - */ -struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, - unsigned long addr, struct pt_regs *regs) -{ - struct vm_area_struct *vma; - - if (!get_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (likely(vma && (vma->vm_start <= addr))) - return vma; - - /* - * Well, dang. We might still be successful, but only - * if we can extend a vma to do so. - */ - if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { - mmap_read_unlock(mm); - return NULL; - } - - /* - * We can try to upgrade the mmap lock atomically, - * in which case we can continue to use the vma - * we already looked up. - * - * Otherwise we'll have to drop the mmap lock and - * re-take it, and also look up the vma again, - * re-checking it. - */ - if (!mmap_upgrade_trylock(mm)) { - if (!upgrade_mmap_lock_carefully(mm, regs)) - return NULL; - - vma = find_vma(mm, addr); - if (!vma) - goto fail; - if (vma->vm_start <= addr) - goto success; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto fail; - } - - if (expand_stack_locked(vma, addr)) - goto fail; - -success: - mmap_write_downgrade(mm); - return vma; - -fail: - mmap_write_unlock(mm); - return NULL; -} -#endif - -#ifdef CONFIG_PER_VMA_LOCK -/* - * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be - * stable and not isolated. If the VMA is not found or is being modified the - * function returns NULL. 
- */ -struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, - unsigned long address) -{ - MA_STATE(mas, &mm->mm_mt, address, address); - struct vm_area_struct *vma; - - rcu_read_lock(); -retry: - vma = mas_walk(&mas); - if (!vma) - goto inval; - - if (!vma_start_read(vma)) - goto inval; - - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; - - /* Check if the VMA got isolated after we found it */ - if (vma->detached) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; - } - - rcu_read_unlock(); - return vma; - -inval_end_read: - vma_end_read(vma); -inval: - rcu_read_unlock(); - count_vm_vma_lock_event(VMA_LOCK_ABORT); - return NULL; -} -#endif /* CONFIG_PER_VMA_LOCK */ - #ifndef __PAGETABLE_P4D_FOLDED /* * Allocate p4d page table. @@ -5940,78 +6680,157 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ +static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, + spinlock_t *lock, pte_t *ptep, + pgprot_t pgprot, unsigned long pfn_base, + unsigned long addr_mask, bool writable, + bool special) +{ + args->lock = lock; + args->ptep = ptep; + args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); + args->addr_mask = addr_mask; + args->pgprot = pgprot; + args->writable = writable; + args->special = special; +} + +static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) +{ +#ifdef CONFIG_LOCKDEP + struct file *file = vma->vm_file; + struct address_space *mapping = file ? 
file->f_mapping : NULL; + + if (mapping) + lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || + lockdep_is_held(&vma->vm_mm->mmap_lock)); + else + lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); +#endif +} + /** - * follow_pte - look up PTE at a user virtual address - * @vma: the memory mapping - * @address: user virtual address - * @ptepp: location to store found PTE - * @ptlp: location to store the lock for the PTE + * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address + * @args: Pointer to struct @follow_pfnmap_args * - * On a successful return, the pointer to the PTE is stored in @ptepp; - * the corresponding lock is taken and its location is stored in @ptlp. + * The caller needs to setup args->vma and args->address to point to the + * virtual address as the target of such lookup. On a successful return, + * the results will be put into other output fields. * - * The contents of the PTE are only stable until @ptlp is released using - * pte_unmap_unlock(). This function will fail if the PTE is non-present. - * Present PTEs may include PTEs that map refcounted pages, such as - * anonymous folios in COW mappings. + * After the caller finished using the fields, the caller must invoke + * another follow_pfnmap_end() to proper releases the locks and resources + * of such look up request. * - * Callers must be careful when relying on PTE content after - * pte_unmap_unlock(). Especially if the PTE maps a refcounted page, - * callers must protect against invalidation with MMU notifiers; otherwise - * access to the PFN at a later point in time can trigger use-after-free. + * During the start() and end() calls, the results in @args will be valid + * as proper locks will be held. After the end() is called, all the fields + * in @follow_pfnmap_args will be invalid to be further accessed. 
Further + * use of such information after end() may require proper synchronizations + * by the caller with page table updates, otherwise it can create a + * security bug. + * + * If the PTE maps a refcounted page, callers are responsible to protect + * against invalidation with MMU notifiers; otherwise access to the PFN at + * a later point in time can trigger use-after-free. * * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore - * should be taken for read. + * should be taken for read, and the mmap semaphore cannot be released + * before the end() is invoked. * * This function must not be used to modify PTE content. * - * Return: zero on success, -ve otherwise. + * Return: zero on success, negative otherwise. */ -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pfnmap_start(struct follow_pfnmap_args *args) { + struct vm_area_struct *vma = args->vma; + unsigned long address = args->address; struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; + spinlock_t *lock; + pgd_t *pgdp; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pfnmap_lockdep_assert(vma); - mmap_assert_locked(mm); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) goto out; if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) +retry: + pgdp = pgd_offset(mm, address); + if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp))) goto out; - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + p4dp = p4d_offset(pgdp, address); + p4d = p4dp_get(p4dp); + if (p4d_none(p4d) || unlikely(p4d_bad(p4d))) goto out; - pud = pud_offset(p4d, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + pudp = pud_offset(p4dp, address); + pud = pudp_get(pudp); + if (pud_none(pud)) goto out; + if (pud_leaf(pud)) { + lock = 
pud_lock(mm, pudp); + if (!unlikely(pud_leaf(pud))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud), + pud_pfn(pud), PUD_MASK, pud_write(pud), + pud_special(pud)); + return 0; + } - pmd = pmd_offset(pud, address); - VM_BUG_ON(pmd_trans_huge(*pmd)); + pmdp = pmd_offset(pudp, address); + pmd = pmdp_get_lockless(pmdp); + if (pmd_leaf(pmd)) { + lock = pmd_lock(mm, pmdp); + if (!unlikely(pmd_leaf(pmd))) { + spin_unlock(lock); + goto retry; + } + pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd), + pmd_pfn(pmd), PMD_MASK, pmd_write(pmd), + pmd_special(pmd)); + return 0; + } - ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + ptep = pte_offset_map_lock(mm, pmdp, address, &lock); if (!ptep) goto out; - if (!pte_present(ptep_get(ptep))) + pte = ptep_get(ptep); + if (!pte_present(pte)) goto unlock; - *ptepp = ptep; + pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte), + pte_pfn(pte), PAGE_MASK, pte_write(pte), + pte_special(pte)); return 0; unlock: - pte_unmap_unlock(ptep, *ptlp); + pte_unmap_unlock(ptep, lock); out: return -EINVAL; } -EXPORT_SYMBOL_GPL(follow_pte); +EXPORT_SYMBOL_GPL(follow_pfnmap_start); + +/** + * follow_pfnmap_end(): End a follow_pfnmap_start() process + * @args: Pointer to struct @follow_pfnmap_args + * + * Must be used in pair of follow_pfnmap_start(). See the start() function + * above for more information. 
+ */ +void follow_pfnmap_end(struct follow_pfnmap_args *args) +{ + if (args->lock) + spin_unlock(args->lock); + if (args->ptep) + pte_unmap(args->ptep); +} +EXPORT_SYMBOL_GPL(follow_pfnmap_end); #ifdef CONFIG_HAVE_IOREMAP_PROT /** @@ -6030,36 +6849,36 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; - unsigned long prot = 0; + pgprot_t prot = __pgprot(0); void __iomem *maddr; - pte_t *ptep, pte; - spinlock_t *ptl; int offset = offset_in_page(addr); int ret = -EINVAL; + bool writable; + struct follow_pfnmap_args args = { .vma = vma, .address = addr }; retry: - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) return -EINVAL; - pte = ptep_get(ptep); - pte_unmap_unlock(ptep, ptl); + prot = args.pgprot; + phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT; + writable = args.writable; + follow_pfnmap_end(&args); - prot = pgprot_val(pte_pgprot(pte)); - phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - - if ((write & FOLL_WRITE) && !pte_write(pte)) + if ((write & FOLL_WRITE) && !writable) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; - if (follow_pte(vma, addr, &ptep, &ptl)) + if (follow_pfnmap_start(&args)) goto out_unmap; - if (!pte_same(pte, ptep_get(ptep))) { - pte_unmap_unlock(ptep, ptl); + if ((pgprot_val(prot) != pgprot_val(args.pgprot)) || + (phys_addr != (args.pfn << PAGE_SHIFT)) || + (writable != args.writable)) { + follow_pfnmap_end(&args); iounmap(maddr); - goto retry; } @@ -6068,7 +6887,7 @@ retry: else memcpy_fromio(buf, maddr + offset, len); ret = len; - pte_unmap_unlock(ptep, ptl); + follow_pfnmap_end(&args); out_unmap: iounmap(maddr); @@ -6100,6 +6919,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, while (len) { int bytes, offset; void *maddr; + struct folio *folio; struct vm_area_struct *vma = NULL; struct page *page = 
get_user_page_vma_remote(mm, addr, gup_flags, &vma); @@ -6131,21 +6951,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, if (bytes <= 0) break; } else { + folio = page_folio(page); bytes = len; offset = addr & (PAGE_SIZE-1); if (bytes > PAGE_SIZE-offset) bytes = PAGE_SIZE-offset; - maddr = kmap_local_page(page); + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + folio_mark_dirty_lock(folio); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); } - unmap_and_put_page(page, maddr); + folio_release_kmap(folio, maddr); } len -= bytes; buf += bytes; @@ -6197,6 +7018,126 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + void *old_buf = buf; + int err = 0; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return -EFAULT; + + addr = untagged_addr_remote(mm, addr); + + /* Avoid triggering the temporary warning in __get_user_pages */ + if (!vma_lookup(mm, addr)) { + err = -EFAULT; + goto out; + } + + while (len) { + int bytes, offset, retval; + void *maddr; + struct folio *folio; + struct page *page; + struct vm_area_struct *vma = NULL; + + page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + if (IS_ERR(page)) { + /* + * Treat as a total failure for now until we decide how + * to handle the CONFIG_HAVE_IOREMAP_PROT case and + * stack expansion. 
+ */ + *(char *)buf = '\0'; + err = -EFAULT; + goto out; + } + + folio = page_folio(page); + bytes = len; + offset = addr & (PAGE_SIZE - 1); + if (bytes > PAGE_SIZE - offset) + bytes = PAGE_SIZE - offset; + + maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE); + retval = strscpy(buf, maddr + offset, bytes); + if (retval >= 0) { + /* Found the end of the string */ + buf += retval; + folio_release_kmap(folio, maddr); + break; + } + + buf += bytes - 1; + /* + * Because strscpy always NUL terminates we need to + * copy the last byte in the page if we are going to + * load more pages + */ + if (bytes != len) { + addr += bytes - 1; + copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); + buf += 1; + addr += 1; + } + len -= bytes; + + folio_release_kmap(folio, maddr); + } + +out: + mmap_read_unlock(mm); + if (err) + return err; + return buf - old_buf; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /* * Print the name of a VMA. 
*/ @@ -6229,10 +7170,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(&current->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif @@ -6244,23 +7183,23 @@ EXPORT_SYMBOL(__might_fault); * cache lines hot. */ static inline int process_huge_page( - unsigned long addr_hint, unsigned int pages_per_huge_page, + unsigned long addr_hint, unsigned int nr_pages, int (*process_subpage)(unsigned long addr, int idx, void *arg), void *arg) { int i, n, base, l, ret; unsigned long addr = addr_hint & - ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1); /* Process target subpage last to keep its cache lines hot */ might_sleep(); n = (addr_hint - addr) / PAGE_SIZE; - if (2 * n <= pages_per_huge_page) { + if (2 * n <= nr_pages) { /* If target subpage in first half of huge page */ base = 0; l = n; /* Process subpages at the end of huge page */ - for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + for (i = nr_pages - 1; i >= 2 * n; i--) { cond_resched(); ret = process_subpage(addr + i * PAGE_SIZE, i, arg); if (ret) @@ -6268,8 +7207,8 @@ static inline int process_huge_page( } } else { /* If target subpage in second half of huge page */ - base = pages_per_huge_page - 2 * (pages_per_huge_page - n); - l = pages_per_huge_page - n; + base = nr_pages - 2 * (nr_pages - n); + l = nr_pages - n; /* Process subpages at the begin of huge page */ for (i = 0; i < base; i++) { cond_resched(); @@ -6298,102 +7237,95 @@ static inline int process_huge_page( return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, - unsigned int pages_per_huge_page) +static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint, + unsigned int nr_pages) { + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(folio)); int i; - struct page *p; might_sleep(); - for (i = 0; i < 
pages_per_huge_page; i++) { - p = nth_page(page, i); + for (i = 0; i < nr_pages; i++) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + clear_user_highpage(folio_page(folio, i), addr + i * PAGE_SIZE); } } static int clear_subpage(unsigned long addr, int idx, void *arg) { - struct page *page = arg; + struct folio *folio = arg; - clear_user_highpage(nth_page(page, idx), addr); + clear_user_highpage(folio_page(folio, idx), addr); return 0; } -void clear_huge_page(struct page *page, - unsigned long addr_hint, unsigned int pages_per_huge_page) +/** + * folio_zero_user - Zero a folio which will be mapped to userspace. + * @folio: The folio to zero. + * @addr_hint: The address that will be accessed, or the base address if unclear. + */ +void folio_zero_user(struct folio *folio, unsigned long addr_hint) { - unsigned long addr = addr_hint & - ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); - - if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); - return; - } + unsigned int nr_pages = folio_nr_pages(folio); - process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) + clear_gigantic_page(folio, addr_hint, nr_pages); + else + process_huge_page(addr_hint, nr_pages, clear_subpage, folio); } static int copy_user_gigantic_page(struct folio *dst, struct folio *src, - unsigned long addr, - struct vm_area_struct *vma, - unsigned int pages_per_huge_page) + unsigned long addr_hint, + struct vm_area_struct *vma, + unsigned int nr_pages) { - int i; + unsigned long addr = ALIGN_DOWN(addr_hint, folio_size(dst)); struct page *dst_page; struct page *src_page; + int i; - for (i = 0; i < pages_per_huge_page; i++) { + for (i = 0; i < nr_pages; i++) { dst_page = folio_page(dst, i); src_page = folio_page(src, i); cond_resched(); if (copy_mc_user_highpage(dst_page, src_page, - addr + i*PAGE_SIZE, vma)) { - memory_failure_queue(page_to_pfn(src_page), 0); 
+ addr + i*PAGE_SIZE, vma)) return -EHWPOISON; - } } return 0; } struct copy_subpage_arg { - struct page *dst; - struct page *src; + struct folio *dst; + struct folio *src; struct vm_area_struct *vma; }; static int copy_subpage(unsigned long addr, int idx, void *arg) { struct copy_subpage_arg *copy_arg = arg; - struct page *dst = nth_page(copy_arg->dst, idx); - struct page *src = nth_page(copy_arg->src, idx); + struct page *dst = folio_page(copy_arg->dst, idx); + struct page *src = folio_page(copy_arg->src, idx); - if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) { - memory_failure_queue(page_to_pfn(src), 0); + if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) return -EHWPOISON; - } return 0; } int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma) { - unsigned int pages_per_huge_page = folio_nr_pages(dst); - unsigned long addr = addr_hint & - ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + unsigned int nr_pages = folio_nr_pages(dst); struct copy_subpage_arg arg = { - .dst = &dst->page, - .src = &src->page, + .dst = dst, + .src = src, .vma = vma, }; - if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) - return copy_user_gigantic_page(dst, src, addr, vma, - pages_per_huge_page); + if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) + return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages); - return process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); + return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg); } long copy_folio_from_user(struct folio *dst_folio, @@ -6428,7 +7360,7 @@ long copy_folio_from_user(struct folio *dst_folio, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS static struct kmem_cache *page_ptl_cachep; @@ -6451,7 +7383,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) void ptlock_free(struct ptdesc 
*ptdesc) { - kmem_cache_free(page_ptl_cachep, ptdesc->ptl); + if (ptdesc->ptl) + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif |
