Diffstat (limited to 'include/linux/pgtable.h')
| -rw-r--r-- | include/linux/pgtable.h | 812 |
1 file changed, 671 insertions(+), 141 deletions(-)
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f49abcfe5eda..652f287c1ef6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -50,6 +50,8 @@ #define pmd_pgtable(pmd) pmd_page(pmd) #endif +#define pmd_folio(pmd) page_folio(pmd_page(pmd)) + /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * @@ -88,6 +90,27 @@ static inline unsigned long pud_index(unsigned long address) #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif +#ifndef kernel_pte_init +static inline void kernel_pte_init(void *addr) +{ +} +#define kernel_pte_init kernel_pte_init +#endif + +#ifndef pmd_init +static inline void pmd_init(void *addr) +{ +} +#define pmd_init pmd_init +#endif + +#ifndef pud_init +static inline void pud_init(void *addr) +{ +} +#define pud_init pud_init +#endif + #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { @@ -149,9 +172,7 @@ static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ -#ifndef pgd_offset_k #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) -#endif /* * In many cases it is known that a virtual address is mapped at PMD or PTE @@ -184,6 +205,13 @@ static inline int pmd_young(pmd_t pmd) } #endif +#ifndef pmd_dirty +static inline int pmd_dirty(pmd_t pmd) +{ + return 0; +} +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode @@ -194,17 +222,51 @@ static inline int pmd_young(pmd_t pmd) * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. + * up to date. + * + * In the general case, no lock is guaranteed to be held between entry and exit + * of the lazy mode. So the implementation must assume preemption may be enabled + * and cpu migration is possible; it must take steps to be robust against this. + * (In practice, for user PTE updates, the appropriate page table lock(s) are + * held, but for kernel PTE updates, no lock is held). Nesting is not permitted + * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. 
Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) +{ + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); +} +#endif + +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. @@ -214,6 +276,10 @@ static inline int pmd_young(pmd_t pmd) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * @@ -225,15 +291,13 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, { page_table_check_ptes_set(mm, ptep, pte, nr); - arch_enter_lazy_mmu_mode(); for (;;) { set_pte(ptep, pte); if (--nr == 0) break; ptep++; - pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + pte = pte_next_pfn(pte); } - arch_leave_lazy_mmu_mode(); } #endif #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) @@ -284,6 +348,27 @@ static inline pmd_t pmdp_get(pmd_t *pmdp) } #endif +#ifndef pudp_get +static inline pud_t pudp_get(pud_t *pudp) +{ + return READ_ONCE(*pudp); +} +#endif + +#ifndef p4dp_get +static inline p4d_t p4dp_get(p4d_t *p4dp) +{ + return READ_ONCE(*p4dp); +} +#endif + +#ifndef pgdp_get +static inline pgd_t pgdp_get(pgd_t *pgdp) +{ + return READ_ONCE(*pgdp); +} +#endif + #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, @@ -367,7 +452,38 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void) */ static inline bool arch_has_hw_pte_young(void) { - return false; + return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); +} +#endif + +#ifndef exec_folio_order +/* + * Returns preferred minimum folio order for executable file-backed memory. Must + * be in range [0, PMD_ORDER). Default to order-0. + */ +static inline unsigned int exec_folio_order(void) +{ + return 0; +} +#endif + +#ifndef arch_check_zapped_pte +static inline void arch_check_zapped_pte(struct vm_area_struct *vma, + pte_t pte) +{ +} +#endif + +#ifndef arch_check_zapped_pmd +static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, + pmd_t pmd) +{ +} +#endif + +#ifndef arch_check_zapped_pud +static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ } #endif @@ -383,10 +499,61 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif +#ifndef clear_young_dirty_ptes +/** + * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the + * same folio as old/clean. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to mark old/clean. + * @flags: Flags to modify the PTE batch semantics. + * + * May be overridden by the architecture; otherwise, implemented by + * get_and_clear/modify/set for each pte in the range. + * + * Note that PTE bits in the PTE range besides the PFN can differ. 
For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + unsigned int nr, cydp_t flags) +{ + pte_t pte; + + for (;;) { + if (flags == CYDP_CLEAR_YOUNG) + ptep_test_and_clear_young(vma, addr, ptep); + else { + pte = ptep_get_and_clear(vma->vm_mm, addr, ptep); + if (flags & CYDP_CLEAR_YOUNG) + pte = pte_mkold(pte); + if (flags & CYDP_CLEAR_DIRTY) + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); + } + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH @@ -530,6 +697,121 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } #endif +#ifndef get_and_clear_full_ptes +/** + * get_and_clear_full_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(), merging dirty/accessed bits into the + * returned PTE. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = ptep_get_and_clear_full(mm, addr, ptep, full); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_get_and_clear_full(mm, addr, ptep, full); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +/** + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of + * the same folio, collecting dirty/accessed bits. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of get_and_clear_full_ptes() if it is known that we don't + * need to clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. 
+ */ +static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); +} + +#ifndef clear_full_ptes +/** + * clear_full_ptes - Clear present PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_get_and_clear_full(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + ptep_get_and_clear_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + +/** + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * + * Use this instead of clear_full_ptes() if it is known that we don't need to + * clear the full mm, which is mostly the case. + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + clear_full_ptes(mm, addr, ptep, nr, 0); +} /* * If two threads concurrently fault at the same page, the thread that @@ -539,13 +821,18 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization @@ -562,6 +849,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm, } #endif +#ifndef clear_not_present_full_ptes +/** + * clear_not_present_full_ptes - Clear multiple not present PTEs which are + * consecutive in the pgtable. + * @mm: Address space the ptes represent. + * @addr: Address of the first pte. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to clear. + * @full: Whether we are clearing a full mm. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over pte_clear_not_present_full(). + * + * Context: The caller holds the page table lock. The PTEs are all not present. + * The PTEs are all in the same PMD. 
+ */ +static inline void clear_not_present_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + pte_clear_not_present_full(mm, addr, ptep, full); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, @@ -577,6 +893,20 @@ extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, pud_t *pudp); #endif +#ifndef pte_mkwrite +static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + return pte_mkwrite_novma(pte); +} +#endif + +#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite) +static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + return pmd_mkwrite_novma(pmd); +} +#endif + #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) @@ -586,6 +916,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings @@ -781,45 +1142,16 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) } #endif -/* - * Use set_p*_safe(), and elide TLB flushing, when confident that *no* - * TLB flush will be required as a result of the "set". For example, use - * in scenarios where it is known ahead of time that the routine is - * setting non-present entries, or re-setting an existing entry to the - * same value. Otherwise, use the typical "set" helpers and flush the - * TLB. 
- */ -#define set_pte_safe(ptep, pte) \ -({ \ - WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ - set_pte(ptep, pte); \ -}) - -#define set_pmd_safe(pmdp, pmd) \ -({ \ - WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ - set_pmd(pmdp, pmd); \ -}) - -#define set_pud_safe(pudp, pud) \ -({ \ - WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ - set_pud(pudp, pud); \ -}) - -#define set_p4d_safe(p4dp, p4d) \ -({ \ - WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ - set_p4d(p4dp, p4d); \ -}) - -#define set_pgd_safe(pgdp, pgd) \ -({ \ - WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ - set_pgd(pgdp, pgd); \ -}) - #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -828,12 +1160,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif @@ -861,7 +1198,7 @@ static inline int arch_unmap_one(struct mm_struct *mm, * prototypes must be defined in the arch-specific asm/pgtable.h file. */ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP -static inline int arch_prepare_to_swap(struct page *page) +static inline int arch_prepare_to_swap(struct folio *folio) { return 0; } @@ -883,12 +1220,8 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) } #endif -#ifndef __HAVE_ARCH_PGD_OFFSET_GATE -#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) -#endif - #ifndef __HAVE_ARCH_MOVE_PTE -#define move_pte(pte, prot, old_addr, new_addr) (pte) +#define move_pte(pte, old_addr, new_addr) (pte) #endif #ifndef pte_accessible @@ -899,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif +#ifndef flush_tlb_fix_spurious_fault_pmd +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no @@ -1043,7 +1380,9 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, /* * Commit an update to a pte, leaving any hardware-controlled bits in - * the PTE unmodified. + * the PTE unmodified. The pte returned from ptep_modify_prot_start() may + * additionally have young and/or dirty bits set where previously they were not, + * so the updated pte may have these additional changes. 
*/ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, @@ -1052,6 +1391,102 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ + +/** + * modify_prot_start_ptes - Start a pte protection read-modify-write transaction + * over a batch of ptes, which protects against asynchronous hardware + * modifications to the ptes. The intention is not to prevent the hardware from + * making pte updates, but to prevent any updates it may make from being lost. + * Please see the comment above ptep_modify_prot_start() for full description. + * + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte + * in the batch. + * + * Note that PTE bits in the PTE batch besides the PFN can differ. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. All other PTE bits must be identical for + * all PTEs in the batch except for young and dirty bits. The PTEs are all in + * the same PMD. + */ +#ifndef modify_prot_start_ptes +static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) +{ + pte_t pte, tmp_pte; + + pte = ptep_modify_prot_start(vma, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = ptep_modify_prot_start(vma, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} +#endif + +/** + * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any + * hardware-controlled bits in the PTE unmodified. + * + * @vma: The virtual memory area the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @old_pte: Old page table entry (for the first entry) which is now cleared. + * @pte: New page table entry to be set. + * @nr: Number of entries. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_modify_prot_commit(). + * + * Context: The caller holds the page table lock. The PTEs are all in the same + * PMD. On exit, the set ptes in the batch map the same folio. The ptes set by + * ptep_modify_prot_start() may additionally have young and/or dirty bits set + * where previously they were not, so the updated ptes may have these + * additional changes. + */ +#ifndef modify_prot_commit_ptes +static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr) +{ + int i; + + for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) { + ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); + + /* Advance PFN only, set same prot */ + old_pte = pte_next_pfn(old_pte); + pte = pte_next_pfn(pte); + } +} +#endif + +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc, ioremap and page table update code know when + * arch_sync_kernel_mappings() needs to be called. 
+ */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); + #endif /* CONFIG_MMU */ /* @@ -1122,6 +1557,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. + */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) @@ -1208,64 +1655,92 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) * vmf_insert_pfn. */ -/* - * track_pfn_remap is called when a _new_ pfn mapping is being established - * by remap_pfn_range() for physical range indicated by pfn and size. - */ -static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size) +static inline int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot) { return 0; } -/* - * track_pfn_insert is called when a _new_ single pfn is established - * by vmf_insert_pfn(). - */ -static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +static inline int pfnmap_track(unsigned long pfn, unsigned long size, + pgprot_t *prot) { + return 0; } -/* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - */ -static inline int track_pfn_copy(struct vm_area_struct *vma) +static inline void pfnmap_untrack(unsigned long pfn, unsigned long size) { - return 0; } +#else +/** + * pfnmap_setup_cachemode - setup the cachemode in the pgprot for a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to modify + * + * Lookup the cachemode for the pfn range starting at @pfn with the size + * @size and store it in @prot, leaving other data in @prot unchanged. + * + * This allows for a hardware implementation to have fine-grained control of + * memory cache behavior at page level granularity. Without a hardware + * implementation, this function does nothing. + * + * Currently there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * This function can fail if the pfn range spans pfns that require differing + * cachemodes. If the pfn range was previously verified to have a single + * cachemode, it is sufficient to query only a single pfn. The assumption is + * that this is the case for drivers using the vmf_insert_pfn*() interface. + * + * Returns 0 on success and -EINVAL on error. + */ +int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, + pgprot_t *prot); -/* - * untrack_pfn is called while unmapping a pfnmap for a region. 
- * untrack can be called for a specific region indicated by pfn and size or - * can be for the entire vma (in which case pfn, size are zero). +/** + * pfnmap_track - track a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * @prot: the pgprot to track + * + * Requested the pfn range to be 'tracked' by a hardware implementation and + * setup the cachemode in @prot similar to pfnmap_setup_cachemode(). + * + * This allows for fine-grained control of memory cache behaviour at page + * level granularity. Tracking memory this way is persisted across VMA splits + * (VMA merging does not apply for VM_PFNMAP). + * + * Currently, there is only one implementation for this - x86 Page Attribute + * Table (PAT). See Documentation/arch/x86/pat.rst for more details. + * + * Returns 0 on success and -EINVAL on error. */ -static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, - bool mm_wr_locked) -{ -} +int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot); -/* - * untrack_pfn_clear is called while mremapping a pfnmap for a new region - * or fails to copy pgtable during duplicate vm area. +/** + * pfnmap_untrack - untrack a pfn range + * @pfn: the start of the pfn range + * @size: the size of the pfn range in bytes + * + * Untrack a pfn range previously tracked through pfnmap_track(). + */ +void pfnmap_untrack(unsigned long pfn, unsigned long size); +#endif + +/** + * pfnmap_setup_cachemode_pfn - setup the cachemode in the pgprot for a pfn + * @pfn: the pfn + * @prot: the pgprot to modify + * + * Lookup the cachemode for @pfn and store it in @prot, leaving other + * data in @prot unchanged. + * + * See pfnmap_setup_cachemode() for details. */ -static inline void untrack_pfn_clear(struct vm_area_struct *vma) +static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot) { + pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot); } -#else -extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn, unsigned long addr, - unsigned long size); -extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn); -extern int track_pfn_copy(struct vm_area_struct *vma); -extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size, bool mm_wr_locked); -extern void untrack_pfn_clear(struct vm_area_struct *vma); -#endif #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE @@ -1327,21 +1802,6 @@ static inline int pud_write(pud_t pud) } #endif /* pud_write */ -#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline int pmd_devmap(pmd_t pmd) -{ - return 0; -} -static inline int pud_devmap(pud_t pud) -{ - return 0; -} -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} -#endif - #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static inline int pud_trans_huge(pud_t pud) @@ -1356,7 +1816,7 @@ static inline int pud_trans_unstable(pud_t *pud) defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudval = READ_ONCE(*pud); - if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) + if (pud_none(pudval) || pud_trans_huge(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); @@ -1510,10 +1970,11 @@ static inline bool arch_has_pfn_modify_check(void) /* * Page Table Modification bits for pgtbl_mod_mask. 
* - * These are used by the p?d_alloc_track*() set of functions an in the generic - * vmalloc/ioremap code to track at which page-table levels entries have been - * modified. Based on that the code can better decide when vmalloc and ioremap - * mapping changes need to be synchronized to other page-tables in the system. + * These are used by the p?d_alloc_track*() and p*d_populate_kernel() + * functions in the generic vmalloc, ioremap and page table update code + * to track at which page-table levels entries have been modified. + * Based on that the code can better decide when page table changes need + * to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 @@ -1530,6 +1991,32 @@ static inline bool arch_has_pfn_modify_check(void) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; +enum pgtable_level { + PGTABLE_LEVEL_PTE = 0, + PGTABLE_LEVEL_PMD, + PGTABLE_LEVEL_PUD, + PGTABLE_LEVEL_P4D, + PGTABLE_LEVEL_PGD, +}; + +static inline const char *pgtable_level_to_str(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return "pte"; + case PGTABLE_LEVEL_PMD: + return "pmd"; + case PGTABLE_LEVEL_PUD: + return "pud"; + case PGTABLE_LEVEL_P4D: + return "p4d"; + case PGTABLE_LEVEL_PGD: + return "pgd"; + default: + return "unknown"; + } +} + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) @@ -1579,23 +2066,37 @@ typedef unsigned int pgtbl_mod_mask; #endif /* - * p?d_leaf() - true if this entry is a final mapping to a physical address. - * This differs from p?d_huge() by the fact that they are always available (if - * the architecture supports large pages at the appropriate level) even - * if CONFIG_HUGETLB_PAGE is not defined. - * Only meaningful when called on a valid entry. + * pXd_leaf() is the API to check whether a pgtable entry is a huge page + * mapping. It should work globally across all archs, without any + * dependency on CONFIG_* options. For architectures that do not support + * huge mappings on specific levels, below fallbacks will be used. + * + * A leaf pgtable entry should always imply the following: + * + * - It is a "present" entry. IOW, before using this API, please check it + * with pXd_present() first. NOTE: it may not always mean the "present + * bit" is set. For example, PROT_NONE entries are always "present". + * + * - It should _never_ be a swap entry of any type. Above "present" check + * should have guarded this, but let's be crystal clear on this. + * + * - It should contain a huge PFN, which points to a huge page larger than + * PAGE_SIZE of the platform. The PFN format isn't important here. + * + * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge() + * or hugetlb mappings). */ #ifndef pgd_leaf -#define pgd_leaf(x) 0 +#define pgd_leaf(x) false #endif #ifndef p4d_leaf -#define p4d_leaf(x) 0 +#define p4d_leaf(x) false #endif #ifndef pud_leaf -#define pud_leaf(x) 0 +#define pud_leaf(x) false #endif #ifndef pmd_leaf -#define pmd_leaf(x) 0 +#define pmd_leaf(x) false #endif #ifndef pgd_leaf_size @@ -1610,9 +2111,26 @@ typedef unsigned int pgtbl_mod_mask; #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif +#ifndef __pte_leaf_size #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif +#define __pte_leaf_size(x,y) pte_leaf_size(y) +#endif + +/* + * We always define pmd_pfn for all archs as it's used in lots of generic + * code. 
Now it happens too for pud_pfn (and can happen for larger + * mappings too in the future; we're not there yet). Instead of defining + * it for all archs (like pmd_pfn), provide a fallback. + * + * Note that returning 0 here means any arch that didn't define this can + * get severely wrong when it hits a real pud leaf. It's arch's + * responsibility to properly define it when a huge pud is possible. + */ +#ifndef pud_pfn +#define pud_pfn(x) 0 +#endif /* * Some architectures have MMUs that are configurable or selectable at boot @@ -1636,6 +2154,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: @@ -1657,7 +2187,7 @@ typedef unsigned int pgtbl_mod_mask; * x: (yes) yes */ #define DECLARE_VM_GET_PAGE_PROT \ -pgprot_t vm_get_page_prot(unsigned long vm_flags) \ +pgprot_t vm_get_page_prot(vm_flags_t vm_flags) \ { \ return protection_map[vm_flags & \ (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ |
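
The kernel-doc added above for set_ptes(), get_and_clear_full_ptes() and the other batched helpers spells out a common contract: for nr > 1 the entries all sit in one PMD, map consecutive pages of one folio, go from not-present to present when mapping, and have their accessed/dirty bits folded into the single PTE returned on teardown. The sketch below is not part of the patch; it only illustrates that contract with hypothetical caller functions, and it leaves out the page table locking, rmap and statistics handling a real caller would need.

#include <linux/mm.h>
#include <linux/pgtable.h>
#include <linux/swap.h>

/*
 * Illustrative only: install @nr PTEs for the pages of @folio at @addr.
 * For nr > 1, set_ptes() requires every entry to be not-present on entry
 * and present on exit (see its kernel-doc above).
 */
static void sketch_map_folio_ptes(struct vm_area_struct *vma, struct folio *folio,
				  unsigned long addr, pte_t *ptep, unsigned int nr)
{
	/* First entry; set_ptes() derives the rest by advancing the PFN. */
	pte_t pte = pfn_pte(folio_pfn(folio), vma->vm_page_prot);

	set_ptes(vma->vm_mm, addr, ptep, pte, nr);
}

/*
 * Illustrative teardown of the same batch: get_and_clear_ptes() folds the
 * accessed/dirty bits of all @nr entries into the returned PTE, so the
 * folio state only needs to be updated once.
 */
static void sketch_unmap_folio_ptes(struct vm_area_struct *vma, struct folio *folio,
				    unsigned long addr, pte_t *ptep, unsigned int nr)
{
	pte_t old = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);

	if (pte_dirty(old))
		folio_mark_dirty(folio);
	if (pte_young(old))
		folio_mark_accessed(folio);
}

On architectures with contiguous-PTE hardware, callers like these can use pte_batch_hint() to discover such batches without scanning every entry, as described in its kernel-doc above.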
