diff options
Diffstat (limited to 'include/linux/pagewalk.h')
| -rw-r--r-- | include/linux/pagewalk.h | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h new file mode 100644 index 000000000000..88e18615dd72 --- /dev/null +++ b/include/linux/pagewalk.h @@ -0,0 +1,207 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGEWALK_H +#define _LINUX_PAGEWALK_H + +#include <linux/mm.h> + +struct mm_walk; + +/* Locking requirement during a page walk. */ +enum page_walk_lock { + /* mmap_lock should be locked for read to stabilize the vma tree */ + PGWALK_RDLOCK = 0, + /* vma will be write-locked during the walk */ + PGWALK_WRLOCK = 1, + /* vma is expected to be already write-locked during the walk */ + PGWALK_WRLOCK_VERIFY = 2, + /* vma is expected to be already read-locked during the walk */ + PGWALK_VMA_RDLOCK_VERIFY = 3, +}; + +/** + * struct mm_walk_ops - callbacks for walk_page_range + * @pgd_entry: if set, called for each non-empty PGD (top-level) entry + * @p4d_entry: if set, called for each non-empty P4D entry + * @pud_entry: if set, called for each non-empty PUD entry + * @pmd_entry: if set, called for each non-empty PMD entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each PTE (lowest-level) entry + * including empty ones, except if @install_pte is set. + * If @install_pte is set, @pte_entry is called only for + * existing PTEs. + * @pte_hole: if set, called for each hole at all levels, + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. If @install_pte is specified, this will + * not trigger for any populated ranges. + * @hugetlb_entry: if set, called for each hugetlb entry. This hook + * function is called with the vma lock held, in order to + * protect against a concurrent freeing of the pte_t* or + * the ptl. In some cases, the hook function needs to drop + * and retake the vma lock in order to avoid deadlocks + * while calling other functions. In such cases the hook + * function must either refrain from accessing the pte or + * ptl after dropping the vma lock, or else revalidate + * those items after re-acquiring the vma lock and before + * accessing them. + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 means + * "do page table walk over the current vma", returning + * a negative value means "abort current page table walk + * right now" and returning 1 means "skip the current vma" + * Note that this callback is not called when the caller + * passes in a single VMA as for walk_page_vma(). + * @pre_vma: if set, called before starting walk on a non-null vma. + * @post_vma: if set, called after a walk on a non-null vma, provided + * that @pre_vma and the vma walk succeeded. + * @install_pte: if set, missing page table entries are installed and + * thus all levels are always walked in the specified + * range. This callback is then invoked at the PTE level + * (having split any THP pages prior), providing the PTE to + * install. If allocations fail, the walk is aborted. This + * operation is only available for userland memory. Not + * usable for hugetlb ranges. + * + * p?d_entry callbacks are called even if those levels are folded on a + * particular architecture/configuration. + */ +struct mm_walk_ops { + int (*pgd_entry)(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*p4d_entry)(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_entry)(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_hole)(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk); + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*pre_vma)(unsigned long start, unsigned long end, + struct mm_walk *walk); + void (*post_vma)(struct mm_walk *walk); + int (*install_pte)(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk); + enum page_walk_lock walk_lock; +}; + +/* + * Action for pud_entry / pmd_entry callbacks. + * ACTION_SUBTREE is the default + */ +enum page_walk_action { + /* Descend to next level, splitting huge pages if needed and possible */ + ACTION_SUBTREE = 0, + /* Continue to next entry at this level (ignoring any subtree) */ + ACTION_CONTINUE = 1, + /* Call again for this entry */ + ACTION_AGAIN = 2 +}; + +/** + * struct mm_walk - walk_page_range data + * @ops: operation to call during the walk + * @mm: mm_struct representing the target process of page table walk + * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) + * @vma: vma currently walked (NULL if walking outside vmas) + * @action: next action to perform (see enum page_walk_action) + * @no_vma: walk ignoring vmas (vma will always be NULL) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + const struct mm_walk_ops *ops; + struct mm_struct *mm; + pgd_t *pgd; + struct vm_area_struct *vma; + enum page_walk_action action; + bool no_vma; + void *private; +}; + +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_kernel_page_table_range(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private); +int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, + pgoff_t nr, const struct mm_walk_ops *ops, + void *private); + +typedef int __bitwise folio_walk_flags_t; + +/* + * Walk migration entries as well. Careful: a large folio might get split + * concurrently. + */ +#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) + +/* Walk shared zeropages (small + huge) as well. */ +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) + +enum folio_walk_level { + FW_LEVEL_PTE, + FW_LEVEL_PMD, + FW_LEVEL_PUD, +}; + +/** + * struct folio_walk - folio_walk_start() / folio_walk_end() data + * @page: exact folio page referenced (if applicable) + * @level: page table level identifying the entry type + * @pte: pointer to the page table entry (FW_LEVEL_PTE). + * @pmd: pointer to the page table entry (FW_LEVEL_PMD). + * @pud: pointer to the page table entry (FW_LEVEL_PUD). + * @ptl: pointer to the page table lock. + * + * (see folio_walk_start() documentation for more details) + */ +struct folio_walk { + /* public */ + struct page *page; + enum folio_walk_level level; + union { + pte_t *ptep; + pud_t *pudp; + pmd_t *pmdp; + }; + union { + pte_t pte; + pud_t pud; + pmd_t pmd; + }; + /* private */ + struct vm_area_struct *vma; + spinlock_t *ptl; +}; + +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags); + +#define folio_walk_end(__fw, __vma) do { \ + spin_unlock((__fw)->ptl); \ + if (likely((__fw)->level == FW_LEVEL_PTE)) \ + pte_unmap((__fw)->ptep); \ + vma_pgtable_walk_end(__vma); \ +} while (0) + +#endif /* _LINUX_PAGEWALK_H */ |
