Diffstat (limited to 'mm/pagewalk.c')
-rw-r--r--  mm/pagewalk.c | 172
1 file changed, 112 insertions, 60 deletions
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..90cc346a6ecf 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -5,7 +5,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mmu_context.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 
 #include <asm/tlbflush.h>
 
@@ -143,8 +143,7 @@ again:
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
-			if (pmd_present(*pmd) &&
-			    (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+			if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
				continue;
		}
 
@@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			 * We are ONLY installing, so avoid unnecessarily
			 * splitting a present huge page.
			 */
-			if (pud_present(*pud) &&
-			    (pud_trans_huge(*pud) || pud_devmap(*pud)))
+			if (pud_present(*pud) && pud_trans_huge(*pud))
				continue;
		}
 
@@ -422,7 +420,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
 {
	if (walk_lock == PGWALK_RDLOCK)
		mmap_assert_locked(mm);
-	else
+	else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
		mmap_assert_write_locked(mm);
 }
 
@@ -437,6 +435,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
	case PGWALK_WRLOCK_VERIFY:
		vma_assert_write_locked(vma);
		break;
+	case PGWALK_VMA_RDLOCK_VERIFY:
+		vma_assert_locked(vma);
+		break;
	case PGWALK_RDLOCK:
		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
		break;
@@ -451,7 +452,7 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
  * We usually restrict the ability to install PTEs, but this functionality is
  * available to internal memory management code and provided in mm/internal.h.
  */
-int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
+int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
 {
@@ -517,10 +518,10 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
  * This check is performed on all functions which are parameterised by walk
  * operations and exposed in include/linux/pagewalk.h.
  *
- * Internal memory management code can use the walk_page_range_mm() function to
- * be able to use all page walking operations.
+ * Internal memory management code can use *_unsafe() functions to be able to
+ * use all page walking operations.
  */
-static bool check_ops_valid(const struct mm_walk_ops *ops)
+static bool check_ops_safe(const struct mm_walk_ops *ops)
 {
	/*
	 * The installation of PTEs is solely under the control of memory
@@ -578,15 +579,14 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, const struct mm_walk_ops *ops,
		void *private)
 {
-	if (!check_ops_valid(ops))
+	if (!check_ops_safe(ops))
		return -EINVAL;
 
-	return walk_page_range_mm(mm, start, end, ops, private);
+	return walk_page_range_mm_unsafe(mm, start, end, ops, private);
 }
 
 /**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm:		mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
  * @start:	start address of the virtual address range
  * @end:	end address of the virtual address range
  * @ops:	operation to call during the walk
@@ -596,17 +596,73 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
  * Similar to walk_page_range() but can walk any page tables even if they are
  * not backed by VMAs. Because 'unusual' entries may be walked this function
  * will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel pages tables or page tables for firmware.
  *
  * Note: Be careful to walk the kernel pages tables, the caller may be need to
  * take other effective approaches (mmap lock may be insufficient) to prevent
  * the intermediate kernel page tables belonging to the specified address range
  * from being freed (e.g. memory hot-remove).
  */
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+	/*
+	 * Kernel intermediate page tables are usually not freed, so the mmap
+	 * read lock is sufficient. But there are some exceptions.
+	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
+	 * to prevent the intermediate kernel pages tables belonging to the
+	 * specified address range from being freed. The caller should take
+	 * other actions to prevent this race.
+	 */
+	mmap_assert_locked(&init_mm);
+
+	return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
+						     private);
+}
+
+/*
+ * Use this function to walk the kernel page tables locklessly. It should be
+ * guaranteed that the caller has exclusive access over the range they are
+ * operating on - that there should be no concurrent access, for example,
+ * changing permissions for vmalloc objects.
+ */
+int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
+		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+	struct mm_walk walk = {
+		.ops		= ops,
+		.mm		= &init_mm,
+		.pgd		= pgd,
+		.private	= private,
+		.no_vma		= true
+	};
+
+	if (start >= end)
+		return -EINVAL;
+	if (!check_ops_safe(ops))
+		return -EINVAL;
+
+	return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm:		mm_struct representing the target process of page table walk
+ * @start:	start address of the virtual address range
+ * @end:	end address of the virtual address range
+ * @ops:	operation to call during the walk
+ * @pgd:	pgd to walk if different from mm->pgd
+ * @private:	private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
			  unsigned long end, const struct mm_walk_ops *ops,
-			  pgd_t *pgd,
-			  void *private)
+			  pgd_t *pgd, void *private)
 {
	struct mm_walk walk = {
		.ops		= ops,
@@ -616,41 +672,30 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
		.no_vma		= true
	};
 
+	/* For convenience, we allow traversal of kernel mappings. */
+	if (mm == &init_mm)
+		return walk_kernel_page_table_range(start, end, ops,
+						    pgd, private);
	if (start >= end || !walk.mm)
		return -EINVAL;
-	if (!check_ops_valid(ops))
+	if (!check_ops_safe(ops))
		return -EINVAL;
 
	/*
-	 * 1) For walking the user virtual address space:
-	 *
	 * The mmap lock protects the page walker from changes to the page
	 * tables during the walk. However a read lock is insufficient to
	 * protect those areas which don't have a VMA as munmap() detaches
	 * the VMAs before downgrading to a read lock and actually tearing
	 * down PTEs/page tables. In which case, the mmap write lock should
-	 * be hold.
-	 *
-	 * 2) For walking the kernel virtual address space:
-	 *
-	 * The kernel intermediate page tables usually do not be freed, so
-	 * the mmap map read lock is sufficient. But there are some exceptions.
-	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
-	 * to prevent the intermediate kernel pages tables belonging to the
-	 * specified address range from being freed. The caller should take
-	 * other actions to prevent this race.
+	 * be held.
	 */
-	if (mm == &init_mm)
-		mmap_assert_locked(walk.mm);
-	else
-		mmap_assert_write_locked(walk.mm);
+	mmap_assert_write_locked(mm);
 
	return walk_pgd_range(start, end, &walk);
 }
 
-int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, const struct mm_walk_ops *ops,
-		void *private)
+int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops, void *private)
 {
	struct mm_walk walk = {
		.ops		= ops,
@@ -663,14 +708,22 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
		return -EINVAL;
	if (start < vma->vm_start || end > vma->vm_end)
		return -EINVAL;
-	if (!check_ops_valid(ops))
-		return -EINVAL;
 
	process_mm_walk_lock(walk.mm, ops->walk_lock);
	process_vma_walk_lock(vma, ops->walk_lock);
	return __walk_page_range(start, end, &walk);
 }
 
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private)
+{
+	if (!check_ops_safe(ops))
+		return -EINVAL;
+
+	return walk_page_range_vma_unsafe(vma, start, end, ops, private);
+}
+
 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
		void *private)
 {
@@ -683,7 +736,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 
	if (!walk.mm)
		return -EINVAL;
-	if (!check_ops_valid(ops))
+	if (!check_ops_safe(ops))
		return -EINVAL;
 
	process_mm_walk_lock(walk.mm, ops->walk_lock);
@@ -734,7 +787,7 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
	unsigned long start_addr, end_addr;
	int err = 0;
 
-	if (!check_ops_valid(ops))
+	if (!check_ops_safe(ops))
		return -EINVAL;
 
	lockdep_assert_held(&mapping->i_mmap_rwsem);
@@ -868,23 +921,23 @@ struct folio *folio_walk_start(struct folio_walk *fw,
		fw->pudp = pudp;
		fw->pud = pud;
 
-		/*
-		 * TODO: FW_MIGRATION support for PUD migration entries
-		 * once there are relevant users.
-		 */
-		if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+		if (pud_none(pud)) {
			spin_unlock(ptl);
			goto not_found;
-		} else if (!pud_leaf(pud)) {
+		} else if (pud_present(pud) && !pud_leaf(pud)) {
			spin_unlock(ptl);
			goto pmd_table;
+		} else if (pud_present(pud)) {
+			page = vm_normal_page_pud(vma, addr, pud);
+			if (page)
+				goto found;
		}
		/*
-		 * TODO: vm_normal_page_pud() will be handy once we want to
-		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+		 * TODO: FW_MIGRATION support for PUD migration entries
+		 * once there are relevant users.
		 */
-		page = pud_page(pud);
-		goto found;
	}
 
 pmd_table:
@@ -920,10 +973,10 @@ pmd_table:
				goto found;
			}
		} else if ((flags & FW_MIGRATION) &&
-			   is_pmd_migration_entry(pmd)) {
-			swp_entry_t entry = pmd_to_swp_entry(pmd);
+			   pmd_is_migration_entry(pmd)) {
+			const softleaf_t entry = softleaf_from_pmd(pmd);
 
-			page = pfn_swap_entry_to_page(entry);
+			page = softleaf_to_page(entry);
			expose_page = false;
			goto found;
		}
@@ -954,11 +1007,10 @@ pte_table:
			goto found;
		}
	} else if (!pte_none(pte)) {
-		swp_entry_t entry = pte_to_swp_entry(pte);
+		const softleaf_t entry = softleaf_from_pte(pte);
 
-		if ((flags & FW_MIGRATION) &&
-		    is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
+		if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) {
+			page = softleaf_to_page(entry);
			expose_page = false;
			goto found;
		}
@@ -970,7 +1022,7 @@ not_found:
 found:
	if (expose_page)
		/* Note: Offset from the mapped page, not the folio start. */
-		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+		fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
	else
		fw->page = NULL;
	fw->ptl = ptl;
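The hunks above split the old walk_page_range_novma() into walk_kernel_page_table_range(), a lockless variant, and walk_page_range_debug(). As a rough illustration, here is a minimal sketch of a caller of the new kernel-walk entry point. The callback, the counting logic and the count_* names are hypothetical; only the walk_kernel_page_table_range() signature, the mm_walk_ops callback layout and the init_mm mmap-lock assertion are taken from the diff, and passing NULL for the pgd argument is assumed to fall back to init_mm's own page tables as in the pre-existing no-VMA walker.

```c
#include <linux/pagewalk.h>
#include <linux/mm.h>

/* Hypothetical callback: count present PTEs in a kernel virtual range. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(ptep_get(pte)))
		(*nr_present)++;
	return 0;
}

/* No install_pte callback here, so these ops pass the check_ops_safe() gate. */
static const struct mm_walk_ops count_ops = {
	.pte_entry	= count_pte_entry,
};

static unsigned long count_kernel_ptes(unsigned long start, unsigned long end)
{
	unsigned long nr_present = 0;

	/* The non-lockless variant asserts that init_mm's mmap lock is held. */
	mmap_read_lock(&init_mm);
	walk_kernel_page_table_range(start, end, &count_ops, NULL, &nr_present);
	mmap_read_unlock(&init_mm);

	return nr_present;
}
```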
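The new PGWALK_VMA_RDLOCK_VERIFY mode added in the process_mm_walk_lock()/process_vma_walk_lock() hunks skips the mmap-lock assertion entirely and only requires vma_assert_locked(), so it is aimed at walkers whose caller holds a per-VMA lock. A hedged sketch follows, assuming the walk_page_range_vma() signature shown above; the callback and helper are illustrative only, and the caller is presumed to have already locked the VMA (for example via lock_vma_under_rcu()).

```c
/* Hypothetical callback: report whether any PTE in the VMA is present. */
static int probe_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	bool *any_present = walk->private;

	if (pte_present(ptep_get(pte)))
		*any_present = true;
	return 0;
}

static const struct mm_walk_ops probe_ops = {
	.pte_entry	= probe_pte_entry,
	/* Only vma_assert_locked(vma) is checked; the mmap lock is not. */
	.walk_lock	= PGWALK_VMA_RDLOCK_VERIFY,
};

/* Caller must already hold the VMA lock (read or write). */
static bool vma_has_present_pte(struct vm_area_struct *vma)
{
	bool any_present = false;

	walk_page_range_vma(vma, vma->vm_start, vma->vm_end, &probe_ops,
			    &any_present);
	return any_present;
}
```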
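Finally, the folio_walk_start() hunks convert the non-present ("software leaf") entry handling from the swapops.h helpers to the leafops.h ones. The mapping is mechanical; the sketch below simply restates it outside the diff. The helper name is invented for illustration, and the softleaf_* calls are exactly the ones used in the hunks above.

```c
#include <linux/leafops.h>

/* Illustrative only: decode the page behind a PTE-level migration entry. */
static struct page *migration_page_from_pte(pte_t pte)
{
	softleaf_t entry;

	/* Migration entries are always non-present, non-none PTEs. */
	if (pte_present(pte) || pte_none(pte))
		return NULL;

	entry = softleaf_from_pte(pte);		/* was pte_to_swp_entry() */
	if (!softleaf_is_migration(entry))	/* was is_migration_entry() */
		return NULL;

	return softleaf_to_page(entry);		/* was pfn_swap_entry_to_page() */
}
```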
