diff options
Diffstat (limited to 'mm/pgtable-generic.c')
| -rw-r--r-- | mm/pgtable-generic.c | 181 |
1 files changed, 171 insertions, 10 deletions
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4d454953046f..d3aec7a9926a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,9 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mm_inline.h> +#include <linux/iommu.h> +#include <linux/pgalloc.h> + #include <asm/tlb.h> /* @@ -138,8 +141,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; @@ -152,7 +154,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t pud; VM_BUG_ON(address & ~HPAGE_PUD_MASK); - VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + VM_BUG_ON(!pud_trans_huge(*pudp)); pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return pud; @@ -197,6 +199,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; @@ -207,6 +210,7 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + VM_WARN_ON_ONCE(!pmd_present(*pmdp)); return pmdp_invalidate(vma, address, pmdp); } #endif @@ -230,19 +234,67 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, return pmd; } #endif + +/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ +#ifndef pte_free_defer +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = pgtable; + call_rcu(&page->rcu_head, pte_free_now); +} +#endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ + (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) +/* + * See the comment above ptep_get_lockless() in include/linux/pgtable.h: + * the barriers in pmdp_get_lockless() cannot guarantee that the value in + * pmd_high actually belongs with the value in pmd_low; but holding interrupts + * off blocks the TLB flush between present updates, which guarantees that a + * successful __pte_offset_map() points to a page from matched halves. + */ +static unsigned long pmdp_get_lockless_start(void) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + return irqflags; +} +static void pmdp_get_lockless_end(unsigned long irqflags) +{ + local_irq_restore(irqflags); +} +#else +static unsigned long pmdp_get_lockless_start(void) { return 0; } +static void pmdp_get_lockless_end(unsigned long irqflags) { } +#endif + +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { + unsigned long irqflags; pmd_t pmdval; - /* rcu_read_lock() to be added later */ + rcu_read_lock(); + irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); + pmdp_get_lockless_end(irqflags); + if (pmdvalp) *pmdvalp = pmdval; - if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) + if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval))) goto nomap; - if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval))) + if (unlikely(pmd_trans_huge(pmdval))) goto nomap; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); @@ -250,12 +302,12 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) } return __pte_map(&pmdval, addr); nomap: - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); return NULL; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp) +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) { pmd_t pmdval; pte_t *pte; @@ -266,6 +318,77 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, return pte; } +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp) +{ + pte_t *pte; + + VM_WARN_ON_ONCE(!pmdvalp); + pte = __pte_offset_map(pmd, addr, pmdvalp); + if (likely(pte)) + *ptlp = pte_lockptr(mm, pmdvalp); + return pte; +} + +/* + * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation + * __pte_offset_map_lock() below, is usually called with the pmd pointer for + * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while + * holding mmap_lock or vma lock for read or for write; or in truncate or rmap + * context, while holding file's i_mmap_lock or anon_vma lock for read (or for + * write). In a few cases, it may be used with pmd pointing to a pmd_t already + * copied to or constructed on the stack. + * + * When successful, it returns the pte pointer for addr, with its page table + * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent + * modification by software, with a pointer to that spinlock in ptlp (in some + * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's + * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. + * + * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no + * page table at *pmd: if, for example, the page table has just been removed, + * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked + * after acquiring the ptlock, and retried internally if it changed: so that a + * page table can be safely removed or replaced by THP while holding its lock.) + * + * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, + * just returns the pte pointer for addr, its page table kmapped if necessary; + * or NULL if there is no page table at *pmd. It does not attempt to lock the + * page table, so cannot normally be used when the page table is to be updated, + * or when entries read must be stable. But it does take rcu_read_lock(): so + * that even when page table is racily removed, it remains a valid though empty + * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s + * afterwards. + * + * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); + * but when successful, it also outputs a pointer to the spinlock in ptlp - as + * pte_offset_map_lock() does, but in this case without locking it. This helps + * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time + * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock + * pointer for the page table that it returns. Even after grabbing the spinlock, + * we might be looking either at a page table that is still mapped or one that + * was unmapped and is about to get freed. But for R/O access this is sufficient. + * So it is only applicable for read-only cases where any modification operations + * to the page table are not allowed even if the corresponding spinlock is held + * afterwards. + * + * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like + * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. + * It is applicable for may-write cases where any modification operations to the + * page table may happen after the corresponding spinlock is held afterwards. + * But the users should make sure the page table is stable like checking pte_same() + * or checking pmd_same() by using the output pmdval before performing the write + * operations. + * + * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will + * be read-only/read-write protected. + * + * Note that free_pgtables(), used after unmapping detached vmas, or when + * exiting the whole mm, does not take page table lock before freeing a page + * table, and may not use RCU at all: "outsiders" like khugepaged should avoid + * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. + */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { @@ -285,3 +408,41 @@ again: pte_unmap_unlock(pte, ptl); goto again; } + +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +static void kernel_pgtable_work_func(struct work_struct *work); + +static struct { + struct list_head list; + /* protect above ptdesc lists */ + spinlock_t lock; + struct work_struct work; +} kernel_pgtable_work = { + .list = LIST_HEAD_INIT(kernel_pgtable_work.list), + .lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock), + .work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func), +}; + +static void kernel_pgtable_work_func(struct work_struct *work) +{ + struct ptdesc *pt, *next; + LIST_HEAD(page_list); + + spin_lock(&kernel_pgtable_work.lock); + list_splice_tail_init(&kernel_pgtable_work.list, &page_list); + spin_unlock(&kernel_pgtable_work.lock); + + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); + list_for_each_entry_safe(pt, next, &page_list, pt_list) + __pagetable_free(pt); +} + +void pagetable_free_kernel(struct ptdesc *pt) +{ + spin_lock(&kernel_pgtable_work.lock); + list_add(&pt->pt_list, &kernel_pgtable_work.list); + spin_unlock(&kernel_pgtable_work.lock); + + schedule_work(&kernel_pgtable_work.work); +} +#endif |
