summaryrefslogtreecommitdiff
path: root/mm/pgtable-generic.c
diff options
context:
space:
mode:
authorHugh Dickins <hughd@google.com>2023-06-08 18:10:32 -0700
committerAndrew Morton <akpm@linux-foundation.org>2023-06-19 16:19:12 -0700
commit0d940a9b270b9220dcff74d8e9123c9788365751 (patch)
tree253b3e1f57988b7b4e1bbcf5c8c9b26fa510001b /mm/pgtable-generic.c
parent46c475bd676bb05077c8a38b37f175552f035406 (diff)
mm/pgtable: allow pte_offset_map[_lock]() to fail
Make pte_offset_map() a wrapper for __pte_offset_map() (optionally outputs pmdval), pte_offset_map_lock() a sparse __cond_lock wrapper for __pte_offset_map_lock(): those __funcs added in mm/pgtable-generic.c. __pte_offset_map() do pmdval validation (including pmd_clear_bad() when pmd_bad()), returning NULL if pmdval is not for a page table. __pte_offset_map_lock() verify pmdval unchanged after getting the lock, trying again if it changed. No #ifdef CONFIG_TRANSPARENT_HUGEPAGE around them: that could be done to cover the imminent case, but we expect to generalize it later, and it makes a mess of where to do the pmd_bad() clearing. Add pte_offset_map_nolock(): outputs ptl like pte_offset_map_lock(), without actually taking the lock. This will be preferred to open uses of pte_lockptr(), because (when split ptlock is in page table's struct page) it points to the right lock for the returned pte pointer, even if *pmd gets changed racily afterwards. Update corresponding Documentation. Do not add the anticipated rcu_read_lock() and rcu_read_unlock()s yet: they have to wait until all architectures are balancing pte_offset_map()s with pte_unmap()s (as in the arch series posted earlier). But comment where they will go, so that it's easy to add them for experiments. And only when those are in place can transient racy failure cases be enabled. Add more safety for the PAE mismatched pmd_low pmd_high case at that time. Link: https://lkml.kernel.org/r/2929bfd-9893-a374-e463-4c3127ff9b9d@google.com Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Christophe Leroy <christophe.leroy@csgroup.eu> Cc: Christoph Hellwig <hch@infradead.org> Cc: David Hildenbrand <david@redhat.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Ira Weiny <ira.weiny@intel.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Minchan Kim <minchan@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: SeongJae Park <sj@kernel.org> Cc: Song Liu <song@kernel.org> Cc: Steven Price <steven.price@arm.com> Cc: Suren Baghdasaryan <surenb@google.com> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com> Cc: Will Deacon <will@kernel.org> Cc: Yang Shi <shy828301@gmail.com> Cc: Yu Zhao <yuzhao@google.com> Cc: Zack Rusin <zackr@vmware.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/pgtable-generic.c')
-rw-r--r--mm/pgtable-generic.c56
1 files changed, 56 insertions, 0 deletions
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index d2fc52bffafc..c7ab18a5fb77 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,8 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/tlb.h>
@@ -229,3 +231,57 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
+{
+ pmd_t pmdval;
+
+ /* rcu_read_lock() to be added later */
+ pmdval = pmdp_get_lockless(pmd);
+ if (pmdvalp)
+ *pmdvalp = pmdval;
+ if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+ goto nomap;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ goto nomap;
+ }
+ return __pte_map(&pmdval, addr);
+nomap:
+ /* rcu_read_unlock() to be added later */
+ return NULL;
+}
+
+pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ pmd_t pmdval;
+ pte_t *pte;
+
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (likely(pte))
+ *ptlp = pte_lockptr(mm, &pmdval);
+ return pte;
+}
+
+pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, spinlock_t **ptlp)
+{
+ spinlock_t *ptl;
+ pmd_t pmdval;
+ pte_t *pte;
+again:
+ pte = __pte_offset_map(pmd, addr, &pmdval);
+ if (unlikely(!pte))
+ return pte;
+ ptl = pte_lockptr(mm, &pmdval);
+ spin_lock(ptl);
+ if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
+ *ptlp = ptl;
+ return pte;
+ }
+ pte_unmap_unlock(pte, ptl);
+ goto again;
+}