Diffstat (limited to 'mm/pgtable-generic.c')
 mm/pgtable-generic.c | 96 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 14 deletions(-)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4fcd959dcc4d..d3aec7a9926a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -13,7 +13,9 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/mm_inline.h>
-#include <asm/pgalloc.h>
+#include <linux/iommu.h>
+#include <linux/pgalloc.h>
+
 #include <asm/tlb.h>
 
 /*
@@ -139,8 +141,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 {
         pmd_t pmd;
         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-        VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
-                  !pmd_devmap(*pmdp));
+        VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
         pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
         flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return pmd;
@@ -153,7 +154,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
         pud_t pud;
 
         VM_BUG_ON(address & ~HPAGE_PUD_MASK);
-        VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+        VM_BUG_ON(!pud_trans_huge(*pudp));
         pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
         flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
         return pud;
@@ -198,6 +199,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                       pmd_t *pmdp)
 {
+        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
         flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
         return old;
@@ -208,6 +210,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
 {
+        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
         return pmdp_invalidate(vma, address, pmdp);
 }
 #endif
@@ -277,7 +280,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; }
 static void pmdp_get_lockless_end(unsigned long irqflags) { }
 #endif
 
-pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 {
         unsigned long irqflags;
         pmd_t pmdval;
@@ -289,9 +292,9 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 
         if (pmdvalp)
                 *pmdvalp = pmdval;
-        if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+        if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
                 goto nomap;
-        if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+        if (unlikely(pmd_trans_huge(pmdval)))
                 goto nomap;
         if (unlikely(pmd_bad(pmdval))) {
                 pmd_clear_bad(pmd);
@@ -303,8 +306,8 @@ nomap:
         return NULL;
 }
 
-pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
-                             unsigned long addr, spinlock_t **ptlp)
+pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
+                                unsigned long addr, spinlock_t **ptlp)
 {
         pmd_t pmdval;
         pte_t *pte;
@@ -315,6 +318,19 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
         return pte;
 }
 
+pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
+                                unsigned long addr, pmd_t *pmdvalp,
+                                spinlock_t **ptlp)
+{
+        pte_t *pte;
+
+        VM_WARN_ON_ONCE(!pmdvalp);
+        pte = __pte_offset_map(pmd, addr, pmdvalp);
+        if (likely(pte))
+                *ptlp = pte_lockptr(mm, pmdvalp);
+        return pte;
+}
+
 /*
  * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
  * __pte_offset_map_lock() below, is usually called with the pmd pointer for
@@ -345,14 +361,28 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
  * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
  * afterwards.
  *
- * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
+ * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
  * but when successful, it also outputs a pointer to the spinlock in ptlp - as
  * pte_offset_map_lock() does, but in this case without locking it. This helps
  * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns. In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
+ * pointer for the page table that it returns. Even after grabbing the spinlock,
+ * we might be looking either at a page table that is still mapped or one that
+ * was unmapped and is about to get freed. But for R/O access this is sufficient.
+ * So it is only applicable for read-only cases where any modification operations
+ * to the page table are not allowed even if the corresponding spinlock is held
+ * afterwards.
+ *
+ * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
+ * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval.
+ * It is applicable for may-write cases where any modification operations to the
+ * page table may happen after the corresponding spinlock is held afterwards.
+ * But the users should make sure the page table is stable like checking pte_same()
+ * or checking pmd_same() by using the output pmdval before performing the write
+ * operations.
+ *
+ * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
+ * be read-only/read-write protected.
  *
  * Note that free_pgtables(), used after unmapping detached vmas, or when
  * exiting the whole mm, does not take page table lock before freeing a page
@@ -378,3 +408,41 @@ again:
         pte_unmap_unlock(pte, ptl);
         goto again;
 }
+
+#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
+static void kernel_pgtable_work_func(struct work_struct *work);
+
+static struct {
+        struct list_head list;
+        /* protect above ptdesc lists */
+        spinlock_t lock;
+        struct work_struct work;
+} kernel_pgtable_work = {
+        .list = LIST_HEAD_INIT(kernel_pgtable_work.list),
+        .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
+        .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
+};
+
+static void kernel_pgtable_work_func(struct work_struct *work)
+{
+        struct ptdesc *pt, *next;
+        LIST_HEAD(page_list);
+
+        spin_lock(&kernel_pgtable_work.lock);
+        list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
+        spin_unlock(&kernel_pgtable_work.lock);
+
+        iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
+        list_for_each_entry_safe(pt, next, &page_list, pt_list)
+                __pagetable_free(pt);
+}
+
+void pagetable_free_kernel(struct ptdesc *pt)
+{
+        spin_lock(&kernel_pgtable_work.lock);
+        list_add(&pt->pt_list, &kernel_pgtable_work.list);
+        spin_unlock(&kernel_pgtable_work.lock);
+
+        schedule_work(&kernel_pgtable_work.work);
+}
+#endif
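
The comment block added above tells pte_offset_map_rw_nolock() callers to re-validate the page table, via pmd_same() on the returned pmdval or pte_same() on the PTE contents, after taking the PTL and before writing. A minimal caller-side sketch of that contract, not taken from this patch; the helper name example_clear_pte() and its body are hypothetical:

/* Hypothetical caller of pte_offset_map_rw_nolock(): clear one PTE. */
static bool example_clear_pte(struct mm_struct *mm, pmd_t *pmd,
                              unsigned long addr)
{
        spinlock_t *ptl;
        pmd_t pmdval;
        pte_t *pte;

        pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
        if (!pte)
                return false;           /* no page table mapped here */

        spin_lock(ptl);
        if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
                /* Page table was withdrawn or replaced under us: back off. */
                spin_unlock(ptl);
                pte_unmap(pte);
                return false;
        }
        /* *pmd is unchanged, so this is still the right table: write away. */
        pte_clear(mm, addr, pte);
        spin_unlock(ptl);
        pte_unmap(pte);
        return true;
}

Callers that only read PTE contents under the PTL can use pte_offset_map_ro_nolock() instead and skip the recheck, as the comment above spells out.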
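The CONFIG_ASYNC_KERNEL_PGTABLE_FREE hunk batches freed kernel page-table pages on a list and lets a workqueue run iommu_sva_invalidate_kva_range() before handing them to __pagetable_free(), so IOMMU SVA paging-structure caches cannot keep stale references to a reused page. A caller-side sketch, assuming kernel page-table teardown routes its ptdesc through pagetable_free_kernel(); the helper example_free_kernel_pte_table() is made up for illustration:

/* Hypothetical free path for a kernel PTE table page. */
static void example_free_kernel_pte_table(pte_t *pte)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pte);

        /*
         * Do not free directly: queue the ptdesc so the worker can flush
         * IOMMU SVA paging-structure caches for the kernel address space
         * before the page is actually released.
         */
        pagetable_free_kernel(ptdesc);
}

Deferring to the workqueue also batches the flush: every table queued before the worker runs is covered by a single iommu_sva_invalidate_kva_range() call rather than one invalidation per table.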
