Diffstat (limited to 'mm/rmap.c')
-rw-r--r--   mm/rmap.c   | 1279
1 file changed, 777 insertions(+), 502 deletions(-)
diff --git a/mm/rmap.c b/mm/rmap.c
index 3746a5531018..1320b88fab74 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
- * page->flags PG_locked (lock_page)
+ * folio_lock
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* vma_start_write
* mapping->i_mmap_rwsem
@@ -32,7 +32,6 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in block_dirty_folio)
- * folio_lock_memcg move_lock (in block_dirty_folio)
* i_pages lock (widely used)
* lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
@@ -50,7 +49,7 @@
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* vma_lock (hugetlb specific lock for pmd_sharing)
* mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
- * page->flags PG_locked (lock_page)
+ * folio_lock
*/
#include <linux/mm.h>
@@ -75,6 +74,7 @@
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -182,8 +182,6 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* for the new allocation. At the same time, we do not want
* to do any locking for the common case of already having
* an anon_vma.
- *
- * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -191,6 +189,7 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
struct anon_vma *anon_vma, *allocated;
struct anon_vma_chain *avc;
+ mmap_assert_locked(mm);
might_sleep();
avc = anon_vma_chain_alloc(GFP_KERNEL);
@@ -497,7 +496,7 @@ void __init anon_vma_init(void)
* concurrently without folio lock protection). See folio_lock_anon_vma_read()
* which has already covered that, and comment above remap_pages().
*/
-struct anon_vma *folio_get_anon_vma(struct folio *folio)
+struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
@@ -541,7 +540,7 @@ out:
* reference like with folio_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
@@ -673,7 +672,7 @@ void try_to_unmap_flush_dirty(void)
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch;
@@ -682,7 +681,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
if (!pte_accessible(mm, pteval))
return;
- arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
+ arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
tlb_ubc->flush_required = true;
/*
@@ -758,7 +757,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
- unsigned long uaddr)
+ unsigned long start, unsigned long end)
{
}
@@ -768,21 +767,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
-/*
- * At what user virtual address is page expected in vma?
- * Caller should check the page is actually part of the vma.
+/**
+ * page_address_in_vma - The virtual address of a page in this VMA.
+ * @folio: The folio containing the page.
+ * @page: The page within the folio.
+ * @vma: The VMA we need to know the address in.
+ *
+ * Calculates the user virtual address of this page in the specified VMA.
+ * It is the caller's responsibility to check the page is actually
+ * within the VMA. There may not currently be a PTE pointing at this
+ * page, but if a page fault occurs at this address, this is the page
+ * which will be accessed.
+ *
+ * Context: Caller should hold a reference to the folio. Caller should
+ * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
+ * VMA from being altered.
+ *
+ * Return: The virtual address corresponding to this page in the VMA.
*/
-unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_address_in_vma(const struct folio *folio,
+ const struct page *page, const struct vm_area_struct *vma)
{
- struct folio *folio = page_folio(page);
if (folio_test_anon(folio)) {
- struct anon_vma *page__anon_vma = folio_anon_vma(folio);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
*/
- if (!vma->anon_vma || !page__anon_vma ||
- vma->anon_vma->root != page__anon_vma->root)
+ if (!vma->anon_vma || !anon_vma ||
+ vma->anon_vma->root != anon_vma->root)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
@@ -790,7 +803,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
}
- return vma_address(page, vma);
+ /* KSM folios don't reach here because of the !anon_vma check */
+ return vma_address(vma, page_pgoff(folio, page), 1);
}
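
As a usage illustration of the new three-argument interface, a minimal sketch (the helper name and surrounding context are assumptions, not part of this patch; assumes <linux/mm.h> and <linux/rmap.h>):

static bool example_page_in_vma(const struct page *page,
				const struct vm_area_struct *vma,
				unsigned long *addrp)
{
	const struct folio *folio = page_folio(page);
	unsigned long addr;

	/* Per the kerneldoc above: hold a folio reference plus a lock
	 * (e.g. mmap_lock or i_mmap_rwsem) that keeps @vma stable. */
	addr = page_address_in_vma(folio, page, vma);
	if (addr == -EFAULT)
		return false;	/* page does not belong to this VMA */

	*addrp = addr;
	return true;
}
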
/*
@@ -867,13 +881,24 @@ static bool folio_referenced_one(struct folio *folio,
continue;
}
- if (pvmw.pte) {
- if (lru_gen_enabled() &&
- pte_young(ptep_get(pvmw.pte))) {
- lru_gen_look_around(&pvmw);
- referenced++;
- }
+ /*
+ * Skip the non-shared swapbacked folio mapped solely by
+ * the exiting or OOM-reaped process. This avoids redundant
+ * swap-out followed by an immediate unmap.
+ */
+ if ((!atomic_read(&vma->vm_mm->mm_users) ||
+ check_stable_address_space(vma->vm_mm)) &&
+ folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_maybe_mapped_shared(folio)) {
+ pra->referenced = -1;
+ page_vma_mapped_walk_done(&pvmw);
+ return false;
+ }
+ if (lru_gen_enabled() && pvmw.pte) {
+ if (lru_gen_look_around(&pvmw))
+ referenced++;
+ } else if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte))
referenced++;
@@ -961,7 +986,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags)
{
- int we_locked = 0;
+ bool we_locked = false;
struct folio_referenced_arg pra = {
.mapcount = folio_mapcount(folio),
.memcg = memcg,
@@ -1019,6 +1044,14 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
pte_t *pte = pvmw->pte;
pte_t entry = ptep_get(pte);
+ /*
+ * PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages are clean and not writable from a
+ * CPU perspective. The MMU notifier takes care of any
+ * device aspects.
+ */
+ if (!pte_present(entry))
+ continue;
if (!pte_dirty(entry) && !pte_write(entry))
continue;
@@ -1102,6 +1135,80 @@ int folio_mkclean(struct folio *folio)
}
EXPORT_SYMBOL_GPL(folio_mkclean);
+struct wrprotect_file_state {
+ int cleaned;
+ pgoff_t pgoff;
+ unsigned long pfn;
+ unsigned long nr_pages;
+};
+
+static bool mapping_wrprotect_range_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
+{
+ struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = state->pfn,
+ .nr_pages = state->nr_pages,
+ .pgoff = state->pgoff,
+ .vma = vma,
+ .address = address,
+ .flags = PVMW_SYNC,
+ };
+
+ state->cleaned += page_vma_mkclean_one(&pvmw);
+
+ return true;
+}
+
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked);
+
+/**
+ * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
+ *
+ * @mapping: The mapping whose reverse mapping should be traversed.
+ * @pgoff: The page offset at which @pfn is mapped within @mapping.
+ * @pfn: The PFN of the page mapped in @mapping at @pgoff.
+ * @nr_pages: The number of physically contiguous base pages spanned.
+ *
+ * Traverses the reverse mapping, finding all VMAs which contain a shared
+ * mapping of the pages in the specified range in @mapping, and write-protects
+ * them (that is, updates the page tables to mark the mappings read-only such
+ * that a write protection fault arises when the mappings are written to).
+ *
+ * The @pfn value need not refer to a folio, but rather can reference a kernel
+ * allocation which is mapped into userland. We therefore do not require that
+ * the page maps to a folio with a valid mapping or index field, rather the
+ * caller specifies these in @mapping and @pgoff.
+ *
+ * Return: the number of write-protected PTEs, or an error.
+ */
+int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
+ unsigned long pfn, unsigned long nr_pages)
+{
+ struct wrprotect_file_state state = {
+ .cleaned = 0,
+ .pgoff = pgoff,
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&state,
+ .rmap_one = mapping_wrprotect_range_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
+
+ if (!mapping)
+ return 0;
+
+ __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc,
+ /* locked = */false);
+
+ return state.cleaned;
+}
+EXPORT_SYMBOL_GPL(mapping_wrprotect_range);
+
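
A hedged sketch of how an exported user might call this interface (the helper and its surrounding driver context are assumptions; only mapping_wrprotect_range() itself comes from this patch; assumes <linux/rmap.h>):

static int example_wrprotect_one_page(struct address_space *mapping,
				      unsigned long pfn, pgoff_t pgoff)
{
	/*
	 * @pfn does not need to belong to a folio with a valid ->mapping
	 * or ->index; the caller supplies @mapping and @pgoff instead,
	 * which suits kernel buffers mapped into userspace.
	 */
	int cleaned = mapping_wrprotect_range(mapping, pgoff, pfn, 1);

	/* Number of PTEs made read-only (or an error). */
	return cleaned;
}
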
/**
* pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
* [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
@@ -1128,64 +1235,75 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
if (invalid_mkclean_vma(vma, NULL))
return 0;
- pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ pvmw.address = vma_address(vma, pgoff, nr_pages);
VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
return page_vma_mkclean_one(&pvmw);
}
-int folio_total_mapcount(struct folio *folio)
-{
- int mapcount = folio_entire_mapcount(folio);
- int nr_pages;
- int i;
-
- /* In the common case, avoid the loop when no pages mapped by PTE */
- if (folio_nr_pages_mapped(folio) == 0)
- return mapcount;
- /*
- * Add all the PTE mappings of those pages mapped by PTE.
- * Limit the loop to folio_nr_pages_mapped()?
- * Perhaps: given all the raciness, that may be a good or a bad idea.
- */
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
-
- /* But each of those _mapcounts was based on -1 */
- mapcount += nr_pages;
- return mapcount;
-}
-
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
- struct page *page, int nr_pages, enum rmap_level level,
- int *nr_pmdmapped)
+ struct page *page, int nr_pages, struct vm_area_struct *vma,
+ enum rmap_level level, int *nr_pmdmapped)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
- int first, nr = 0;
+ const int orig_nr_pages = nr_pages;
+ int first = 0, nr = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
- do {
- first = atomic_inc_and_test(&page->_mapcount);
- if (first && folio_test_large(folio)) {
- first = atomic_inc_return_relaxed(mapped);
- first = (first < ENTIRELY_MAPPED);
- }
+ if (!folio_test_large(folio)) {
+ nr = atomic_inc_and_test(&folio->_mapcount);
+ break;
+ }
- if (first)
- nr++;
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
+ if (nr == orig_nr_pages)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
+ do {
+ first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
+
+ if (first &&
+ atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
+ nr = first;
+
+ folio_add_large_mapcount(folio, orig_nr_pages, vma);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
first = atomic_inc_and_test(&folio->_entire_mapcount);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ if (level == RMAP_LEVEL_PMD && first)
+ *nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_inc_return_large_mapcount(folio, vma);
+ if (nr == 1)
+ /* Was completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ else
+ nr = 0;
+ break;
+ }
+
if (first) {
nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
- *nr_pmdmapped = folio_nr_pages(folio);
- nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ /*
+ * We only track PMD mappings of PMD-sized
+ * folios separately.
+ */
+ if (level == RMAP_LEVEL_PMD)
+ *nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of a remove and another add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1194,6 +1312,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
nr = 0;
}
}
+ folio_inc_large_mapcount(folio, vma);
break;
}
return nr;
@@ -1263,8 +1382,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
*/
-static void __page_check_anon_rmap(struct folio *folio, struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+static void __page_check_anon_rmap(const struct folio *folio,
+ const struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
{
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
@@ -1279,37 +1399,46 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page,
*/
VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
folio);
- VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
page);
}
+static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
+{
+ int idx;
+
+ if (nr) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, nr);
+ }
+ if (nr_pmdmapped) {
+ if (folio_test_anon(folio)) {
+ idx = NR_ANON_THPS;
+ __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped);
+ } else {
+ /* NR_*_PMDMAPPED are not maintained per-memcg */
+ idx = folio_test_swapbacked(folio) ?
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED;
+ __mod_node_page_state(folio_pgdat(folio), idx,
+ nr_pmdmapped);
+ }
+ }
+}
+
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
unsigned long address, rmap_t flags, enum rmap_level level)
{
int i, nr, nr_pmdmapped = 0;
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
- if (nr_pmdmapped)
- __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
- if (nr)
- __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
+ VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
- if (unlikely(!folio_test_anon(folio))) {
- VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
- /*
- * For a PTE-mapped large folio, we only know that the single
- * PTE is exclusive. Further, __folio_set_anon() might not get
- * folio->index right when not given the address of the head
- * page.
- */
- VM_WARN_ON_FOLIO(folio_test_large(folio) &&
- level != RMAP_LEVEL_PMD, folio);
- __folio_set_anon(folio, vma, address,
- !!(flags & RMAP_EXCLUSIVE));
- } else if (likely(!folio_test_ksm(folio))) {
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+
+ if (likely(!folio_test_ksm(folio)))
__page_check_anon_rmap(folio, page, vma, address);
- }
+
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
if (flags & RMAP_EXCLUSIVE) {
switch (level) {
@@ -1320,15 +1449,32 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
case RMAP_LEVEL_PMD:
SetPageAnonExclusive(page);
break;
+ case RMAP_LEVEL_PUD:
+ /*
+ * Keep the compiler happy, we don't support anonymous
+ * PUD mappings.
+ */
+ WARN_ON_ONCE(1);
+ break;
}
}
+
+ VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
+ atomic_read(&folio->_mapcount) > 0, folio);
for (i = 0; i < nr_pages; i++) {
struct page *cur_page = page + i;
- /* While PTE-mapping a THP we have a PMD and a PTE mapping. */
- VM_WARN_ON_FOLIO((atomic_read(&cur_page->_mapcount) > 0 ||
- (folio_test_large(folio) &&
- folio_entire_mapcount(folio) > 1)) &&
+ VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+ folio_entire_mapcount(folio) > 1 &&
+ PageAnonExclusive(cur_page), folio);
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
+ continue;
+
+ /*
+ * While PTE-mapping a THP we have a PMD and a PTE
+ * mapping.
+ */
+ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
PageAnonExclusive(cur_page), folio);
}
@@ -1395,50 +1541,72 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
* @folio: The folio to add the mapping to.
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @flags: The rmap flags
*
* Like folio_add_anon_rmap_*() but must only be called on *new* folios.
* This means the inc-and-test can be bypassed.
- * The folio does not have to be locked.
+ * The folio doesn't necessarily need to be locked while it's exclusive
+ * unless two threads map it concurrently. However, the folio must be
+ * locked if it's shared.
*
- * If the folio is pmd-mappable, it is accounted as a THP. As the folio
- * is new, it's assumed to be mapped exclusively by a single process.
+ * If the folio is pmd-mappable, it is accounted as a THP.
*/
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, rmap_t flags)
{
- int nr = folio_nr_pages(folio);
+ const bool exclusive = flags & RMAP_EXCLUSIVE;
+ int nr = 1, nr_pmdmapped = 0;
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
- VM_BUG_ON_VMA(address < vma->vm_start ||
- address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
- __folio_set_swapbacked(folio);
- __folio_set_anon(folio, vma, address, true);
+ VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);
+
+ /*
+ * VM_DROPPABLE mappings don't swap; instead they're just dropped when
+ * under memory pressure.
+ */
+ if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
+ __folio_set_swapbacked(folio);
+ __folio_set_anon(folio, vma, address, exclusive);
if (likely(!folio_test_large(folio))) {
/* increment count (starts at -1) */
atomic_set(&folio->_mapcount, 0);
- SetPageAnonExclusive(&folio->page);
+ if (exclusive)
+ SetPageAnonExclusive(&folio->page);
} else if (!folio_test_pmd_mappable(folio)) {
int i;
+ nr = folio_large_nr_pages(folio);
for (i = 0; i < nr; i++) {
struct page *page = folio_page(folio, i);
- /* increment count (starts at -1) */
- atomic_set(&page->_mapcount, 0);
- SetPageAnonExclusive(page);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ if (exclusive)
+ SetPageAnonExclusive(page);
}
- atomic_set(&folio->_nr_pages_mapped, nr);
+ folio_set_large_mapcount(folio, nr, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, nr);
} else {
+ nr = folio_large_nr_pages(folio);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
- atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
- SetPageAnonExclusive(&folio->page);
- __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
+ folio_set_large_mapcount(folio, 1, vma);
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
+ if (exclusive)
+ SetPageAnonExclusive(&folio->page);
+ nr_pmdmapped = nr;
}
- __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr);
+ VM_WARN_ON_ONCE(address < vma->vm_start ||
+ address + (nr << PAGE_SHIFT) > vma->vm_end);
+
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
+ mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
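
For illustration, the caller side for a brand-new exclusive anonymous folio now looks roughly like this (condensed sketch of a typical anonymous-fault path; PTE installation and error handling omitted, names illustrative; assumes <linux/rmap.h> and <linux/swap.h>):

static void example_map_new_anon_folio(struct folio *folio,
				       struct vm_area_struct *vma,
				       unsigned long addr)
{
	/* Brand-new and exclusively mapped: no folio lock required. */
	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
}
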
static __always_inline void __folio_add_file_rmap(struct folio *folio,
@@ -1449,12 +1617,8 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio,
VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
- nr = __folio_add_rmap(folio, page, nr_pages, level, &nr_pmdmapped);
- if (nr_pmdmapped)
- __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ?
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped);
- if (nr)
- __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr);
+ nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);
+ __folio_mod_stat(folio, nr, nr_pmdmapped);
/* See comments in folio_add_anon_rmap_*() */
if (!folio_test_large(folio))
@@ -1498,36 +1662,95 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio: The folio to add the mapping to
+ * @page: The first page to add
+ * @vma: The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
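
A hypothetical caller sketch for the new PUD helper, showing only the expected calling convention; the fault-handler context is an assumption and the call must run under the page table lock, per the kerneldoc above:

static void example_account_file_pud(struct folio *folio,
				     struct vm_area_struct *vma)
{
	/* The second argument is the first page of the PUD-sized range. */
	folio_add_file_rmap_pud(folio, folio_page(folio, 0), vma);
}
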
static __always_inline void __folio_remove_rmap(struct folio *folio,
struct page *page, int nr_pages, struct vm_area_struct *vma,
enum rmap_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
- int last, nr = 0, nr_pmdmapped = 0;
- enum node_stat_item idx;
+ int last = 0, nr = 0, nr_pmdmapped = 0;
+ bool partially_mapped = false;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
- do {
- last = atomic_add_negative(-1, &page->_mapcount);
- if (last && folio_test_large(folio)) {
- last = atomic_dec_return_relaxed(mapped);
- last = (last < ENTIRELY_MAPPED);
+ if (!folio_test_large(folio)) {
+ nr = atomic_add_negative(-1, &folio->_mapcount);
+ break;
+ }
+
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_nr_pages(folio);
+ } else {
+ partially_mapped = nr < folio_large_nr_pages(folio) &&
+ !folio_entire_mapcount(folio);
+ nr = 0;
}
+ break;
+ }
- if (last)
- nr++;
+ folio_sub_large_mapcount(folio, nr_pages, vma);
+ do {
+ last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
+
+ if (last &&
+ atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
+ nr = last;
+
+ partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
+ case RMAP_LEVEL_PUD:
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ last = atomic_add_negative(-1, &folio->_entire_mapcount);
+ if (level == RMAP_LEVEL_PMD && last)
+ nr_pmdmapped = folio_large_nr_pages(folio);
+ nr = folio_dec_return_large_mapcount(folio, vma);
+ if (!nr) {
+ /* Now completely unmapped. */
+ nr = folio_large_nr_pages(folio);
+ } else {
+ partially_mapped = last &&
+ nr < folio_large_nr_pages(folio);
+ nr = 0;
+ }
+ break;
+ }
+
+ folio_dec_large_mapcount(folio, vma);
last = atomic_add_negative(-1, &folio->_entire_mapcount);
if (last) {
nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
if (likely(nr < ENTIRELY_MAPPED)) {
- nr_pmdmapped = folio_nr_pages(folio);
- nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+ nr_pages = folio_large_nr_pages(folio);
+ if (level == RMAP_LEVEL_PMD)
+ nr_pmdmapped = nr_pages;
+ nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
/* Raced ahead of another remove and an add? */
if (unlikely(nr < 0))
nr = 0;
@@ -1536,31 +1759,22 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
nr = 0;
}
}
+
+ partially_mapped = nr && nr < nr_pmdmapped;
break;
}
- if (nr_pmdmapped) {
- if (folio_test_anon(folio))
- idx = NR_ANON_THPS;
- else if (folio_test_swapbacked(folio))
- idx = NR_SHMEM_PMDMAPPED;
- else
- idx = NR_FILE_PMDMAPPED;
- __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
- }
- if (nr) {
- idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
- __lruvec_stat_mod_folio(folio, idx, -nr);
+ /*
+ * Queue anon large folio for deferred split if at least one page of
+ * the folio is unmapped and at least one page is still mapped.
+ *
+ * Check partially_mapped first to ensure it is a large folio.
+ */
+ if (partially_mapped && folio_test_anon(folio) &&
+ !folio_test_partially_mapped(folio))
+ deferred_split_folio(folio, true);
- /*
- * Queue anon large folio for deferred split if at least one
- * page of the folio is unmapped and at least one page
- * is still mapped.
- */
- if (folio_test_large(folio) && folio_test_anon(folio))
- if (level == RMAP_LEVEL_PTE || nr < nr_pmdmapped)
- deferred_split_folio(folio);
- }
+ __folio_mod_stat(folio, -nr, -nr_pmdmapped);
/*
* It would be tidy to reset folio_test_anon mapping when fully
@@ -1610,6 +1824,55 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
#endif
}
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio: The folio to remove the mapping from
+ * @page: The first page to remove
+ * @vma: The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+ struct vm_area_struct *vma)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+ WARN_ON_ONCE(true);
+#endif
+}
+
+static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
+ struct page_vma_mapped_walk *pvmw,
+ enum ttu_flags flags, pte_t pte)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ unsigned long end_addr, addr = pvmw->address;
+ struct vm_area_struct *vma = pvmw->vma;
+ unsigned int max_nr;
+
+ if (flags & TTU_HWPOISON)
+ return 1;
+ if (!folio_test_large(folio))
+ return 1;
+
+ /* We may only batch within a single VMA and a single page table. */
+ end_addr = pmd_addr_end(addr, vma->vm_end);
+ max_nr = (end_addr - addr) >> PAGE_SHIFT;
+
+ /* We only support lazyfree batching for now ... */
+ if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+ return 1;
+ if (pte_unused(pte))
+ return 1;
+
+ return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags,
+ NULL, NULL, NULL);
+}
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1618,11 +1881,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ unsigned long nr_pages = 1, end_addr;
unsigned long pfn;
unsigned long hsz = 0;
@@ -1635,9 +1899,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, false, folio);
-
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
@@ -1663,9 +1924,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
/*
* If the folio is in an mlock()d vma, we must not swap it out.
*/
@@ -1674,12 +1932,49 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* Restore the mlock which got missed */
if (!folio_test_large(folio))
mlock_vma_folio(folio, vma);
- page_vma_mapped_walk_done(&pvmw);
- ret = false;
- break;
+ goto walk_abort;
+ }
+
+ if (!pvmw.pte) {
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
+ if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
+ goto walk_done;
+ /*
+ * unmap_huge_pmd_locked has either already marked
+ * the folio as swap-backed or decided to retain it
+ * due to GUP or speculative references.
+ */
+ goto walk_abort;
+ }
+
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ /*
+ * We temporarily have to drop the PTL and
+ * restart so we can process the PTE-mapped THP.
+ */
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, false);
+ flags &= ~TTU_SPLIT_HUGE_PMD;
+ page_vma_mapped_walk_restart(&pvmw);
+ continue;
+ }
+ }
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
+ } else {
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
- pfn = pte_pfn(ptep_get(pvmw.pte));
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1714,11 +2009,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
if (!anon) {
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (!hugetlb_vma_trylock_write(vma)) {
- page_vma_mapped_walk_done(&pvmw);
- ret = false;
- break;
- }
+ if (!hugetlb_vma_trylock_write(vma))
+ goto walk_abort;
if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
flush_tlb_range(vma,
@@ -1733,30 +2025,36 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* actual page and drop map count
* to zero.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_done;
}
hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
- flush_cache_page(vma, address, pfn);
- /* Nuke the page table entry. */
- if (should_defer_flush(mm, flags)) {
- /*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
- */
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else if (likely(pte_present(pteval))) {
+ nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval);
+ end_addr = address + nr_pages * PAGE_SIZE;
+ flush_cache_range(vma, address, end_addr);
- set_tlb_ubc_flush_pending(mm, pteval, address);
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
- }
+ /* Nuke the page table entry. */
+ pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ if (should_defer_flush(mm, flags))
+ set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+ else
+ flush_tlb_range(vma, address, end_addr);
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
}
/*
@@ -1766,10 +2064,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
@@ -1783,8 +2077,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -1806,9 +2100,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
if (unlikely(folio_test_swapbacked(folio) !=
folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_abort;
}
/* MADV_FREE page check */
@@ -1831,39 +2123,45 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
smp_rmb();
- /*
- * The only page refs must be one from isolation
- * plus the rmap(s) (dropped by discard:).
- */
- if (ref_count == 1 + map_count &&
- !folio_test_dirty(folio)) {
- dec_mm_counter(mm, MM_ANONPAGES);
- goto discard;
+ if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
+ /*
+ * redirtied either using the page table or a previously
+ * obtained GUP reference.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ folio_set_swapbacked(folio);
+ goto walk_abort;
+ } else if (ref_count != 1 + map_count) {
+ /*
+ * Additional reference. Could be a GUP reference or any
+ * speculative reference. GUP users must mark the folio
+ * dirty if there was a modification. This folio cannot be
+ * reclaimed right now either way, so act just like nothing
+ * happened.
+ * We'll come back here later and detect if the folio was
+ * dirtied when the additional reference is gone.
+ */
+ set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
+ goto walk_abort;
}
-
- /*
- * If the folio was redirtied, it cannot be
- * discarded. Remap the page to page table.
- */
- set_pte_at(mm, address, pvmw.pte, pteval);
- folio_set_swapbacked(folio);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
+ goto discard;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_abort;
}
+
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_abort;
}
/* See folio_try_share_anon_rmap(): clear PTE first. */
@@ -1871,9 +2169,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
folio_try_share_anon_rmap_pte(folio, subpage)) {
swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
+ goto walk_abort;
}
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
@@ -1886,10 +2182,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = swp_entry_to_pte(entry);
if (anon_exclusive)
swp_pte = pte_swp_mkexclusive(swp_pte);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, address, pvmw.pte, swp_pte);
} else {
/*
@@ -1906,13 +2209,27 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter_file(folio));
}
discard:
- if (unlikely(folio_test_hugetlb(folio)))
+ if (unlikely(folio_test_hugetlb(folio))) {
hugetlb_remove_rmap(folio);
- else
- folio_remove_rmap_pte(folio, subpage, vma);
+ } else {
+ folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+ }
if (vma->vm_flags & VM_LOCKED)
mlock_drain_local();
- folio_put(folio);
+ folio_put_refs(folio, nr_pages);
+
+ /*
+ * If we are sure that we batched the entire folio and cleared
+ * all PTEs, we can just optimize and stop right here.
+ */
+ if (nr_pages == folio_nr_pages(folio))
+ goto walk_done;
+ continue;
+walk_abort:
+ ret = false;
+walk_done:
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
mmu_notifier_invalidate_range_end(&range);
@@ -1967,9 +2284,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+ bool anon_exclusive, writable, ret = true;
pte_t pteval;
struct page *subpage;
- bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
unsigned long pfn;
@@ -1985,13 +2302,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
/*
- * unmap_page() in mm/huge_memory.c is the only user of migration with
- * TTU_SPLIT_HUGE_PMD and it wants to freeze.
- */
- if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, folio);
-
- /*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
@@ -2016,9 +2326,16 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_locked(vma, pvmw.address,
+ pvmw.pmd, true);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
subpage = folio_page(folio,
pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
@@ -2030,30 +2347,25 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
continue;
- }
#endif
+ }
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- pfn = pte_pfn(ptep_get(pvmw.pte));
-
- if (folio_is_zone_device(folio)) {
- /*
- * Our PTE is a non-present device exclusive entry and
- * calculating the subpage as for the common case would
- * result in an invalid pointer.
- *
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
- subpage = &folio->page;
+ /*
+ * Handle PFN swap PTEs, such as device-exclusive ones, that
+ * actually map pages.
+ */
+ pteval = ptep_get(pvmw.pte);
+ if (likely(pte_present(pteval))) {
+ pfn = pte_pfn(pteval);
} else {
- subpage = folio_page(folio, pfn - folio_pfn(folio));
+ pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
}
+
+ subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
@@ -2109,7 +2421,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
- } else {
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else if (likely(pte_present(pteval))) {
flush_cache_page(vma, address, pfn);
/* Nuke the page table entry. */
if (should_defer_flush(mm, flags)) {
@@ -2123,58 +2438,27 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pteval, address);
+ set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
} else {
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
+ if (pte_dirty(pteval))
+ folio_mark_dirty(folio);
+ writable = pte_write(pteval);
+ } else {
+ pte_clear(mm, address, pvmw.pte);
+ writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
}
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
+ VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+ !anon_exclusive, folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_device_private(folio)) {
- unsigned long pfn = folio_pfn(folio);
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (anon_exclusive)
- WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
- subpage));
+ if (PageHWPoison(subpage)) {
+ VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = pte_to_swp_entry(pteval);
- if (is_writable_device_private_entry(entry))
- entry = make_writable_migration_entry(pfn);
- else if (anon_exclusive)
- entry = make_readable_exclusive_migration_entry(pfn);
- else
- entry = make_readable_migration_entry(pfn);
- swp_pte = swp_entry_to_pte(entry);
-
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
- folio_order(folio));
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
- } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2184,8 +2468,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, mm_counter(folio));
set_pte_at(mm, address, pvmw.pte, pteval);
}
-
- } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+ !userfaultfd_armed(vma)) {
/*
* The guest indicated that the page content is of no
* interest anymore. Simply discard the pte, vmscan
@@ -2201,6 +2485,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ /*
+ * arch_unmap_one() is expected to be a NOP on
+ * architectures where we could have PFN swap PTEs,
+ * so we'll not check/care.
+ */
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte,
@@ -2211,8 +2500,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
- !anon_exclusive, subpage);
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
if (folio_test_hugetlb(folio)) {
@@ -2237,7 +2524,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- if (pte_write(pteval))
+ if (writable)
entry = make_writable_migration_entry(
page_to_pfn(subpage));
else if (anon_exclusive)
@@ -2246,15 +2533,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
- if (pte_young(pteval))
- entry = make_migration_entry_young(entry);
- if (pte_dirty(pteval))
- entry = make_migration_entry_dirty(entry);
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (likely(pte_present(pteval))) {
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
if (folio_test_hugetlb(folio))
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
hsz);
@@ -2329,190 +2624,139 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
- struct mm_struct *mm;
- unsigned long address;
- void *owner;
- bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
- struct vm_area_struct *vma, unsigned long address, void *priv)
+/**
+ * make_device_exclusive() - Mark a page for exclusive use by a device
+ * @mm: mm_struct of associated target process
+ * @addr: the virtual address to mark for exclusive device access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ * @foliop: folio pointer will be stored here on success.
+ *
+ * This function looks up the page mapped at the given address, grabs a
+ * folio reference, locks the folio and replaces the PTE with special
+ * device-exclusive PFN swap entry, preventing access through the process
+ * page tables. The function will return with the folio locked and referenced.
+ *
+ * On fault, the device-exclusive entries are replaced with the original PTE
+ * under folio lock, after calling MMU notifiers.
+ *
+ * Only anonymous non-hugetlb folios are supported and the VMA must have
+ * write permissions such that we can fault in the anonymous page writable
+ * in order to mark it exclusive. The caller must hold the mmap_lock in read
+ * mode.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the folio lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ *
+ * Notes:
+ * #. This function always operates on individual PTEs mapping individual
+ * pages. PMD-sized THPs are first remapped to be mapped by PTEs before
+ * the conversion happens on a single PTE corresponding to @addr.
+ * #. While concurrent access through the process page tables is prevented,
+ * concurrent access through other page references (e.g., earlier GUP
+ * invocation) is not handled and not supported.
+ * #. device-exclusive entries are considered "clean" and "old" by core-mm.
+ * Device drivers must update the folio state when informed by MMU
+ * notifiers.
+ *
+ * Returns: pointer to mapped page on success, otherwise a negative error.
+ */
+struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
+ void *owner, struct folio **foliop)
{
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
- struct make_exclusive_args *args = priv;
- pte_t pteval;
- struct page *subpage;
- bool ret = true;
struct mmu_notifier_range range;
+ struct folio *folio, *fw_folio;
+ struct vm_area_struct *vma;
+ struct folio_walk fw;
+ struct page *page;
swp_entry_t entry;
pte_t swp_pte;
- pte_t ptent;
+ int ret;
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
- vma->vm_mm, address, min(vma->vm_end,
- address + folio_size(folio)),
- args->owner);
- mmu_notifier_invalidate_range_start(&range);
-
- while (page_vma_mapped_walk(&pvmw)) {
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
- ptent = ptep_get(pvmw.pte);
- if (!pte_present(ptent)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- subpage = folio_page(folio,
- pte_pfn(ptent) - folio_pfn(folio));
- address = pvmw.address;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(ptent));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
- /* Set the dirty flag on the folio now the pte is gone. */
- if (pte_dirty(pteval))
- folio_mark_dirty(folio);
-
- /*
- * Check that our target page is still mapped at the expected
- * address.
- */
- if (args->mm == mm && args->address == address &&
- pte_write(pteval))
- args->valid = true;
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- if (pte_write(pteval))
- entry = make_writable_device_exclusive_entry(
- page_to_pfn(subpage));
- else
- entry = make_readable_device_exclusive_entry(
- page_to_pfn(subpage));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ mmap_assert_locked(mm);
+ addr = PAGE_ALIGN_DOWN(addr);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
-
- /*
- * There is a reference on the page for the swap entry which has
- * been removed, so shouldn't take another.
- */
- folio_remove_rmap_pte(folio, subpage, vma);
+ /*
+ * Fault in the page writable and try to lock it; note that if the
+ * address would already be marked for exclusive use by a device,
+ * the GUP call would undo that first by triggering a fault.
+ *
+ * If any other device would already map this page exclusively, the
+ * fault will trigger a conversion to an ordinary
+ * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
+ */
+retry:
+ page = get_user_page_vma_remote(mm, addr,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ &vma);
+ if (IS_ERR(page))
+ return page;
+ folio = page_folio(page);
+
+ if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EOPNOTSUPP);
}
- mmu_notifier_invalidate_range_end(&range);
-
- return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
- struct mm_struct *mm, unsigned long address, void *owner)
-{
- struct make_exclusive_args args = {
- .mm = mm,
- .address = address,
- .owner = owner,
- .valid = false,
- };
- struct rmap_walk_control rwc = {
- .rmap_one = page_make_device_exclusive_one,
- .done = folio_not_mapped,
- .anon_lock = folio_lock_anon_vma_read,
- .arg = &args,
- };
+ ret = folio_lock_killable(folio);
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
/*
- * Restrict to anonymous folios for now to avoid potential writeback
- * issues.
+ * Inform secondary MMUs that we are going to convert this PTE to
+ * device-exclusive, such that they unmap it now. Note that the
+ * caller must filter this event out to prevent livelocks.
*/
- if (!folio_test_anon(folio))
- return false;
-
- rmap_walk(folio, &rwc);
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+ mm, addr, addr + PAGE_SIZE, owner);
+ mmu_notifier_invalidate_range_start(&range);
- return args.valid && !folio_mapcount(folio);
-}
+ /*
+ * Let's do a second walk and make sure we still find the same page
+ * mapped writable. Note that any page of an anonymous folio can
+ * only be mapped writable using exactly one PTE ("exclusive"), so
+ * there cannot be other mappings.
+ */
+ fw_folio = folio_walk_start(&fw, vma, addr, 0);
+ if (fw_folio != folio || fw.page != page ||
+ fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+ if (fw_folio)
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ folio_unlock(folio);
+ folio_put(folio);
+ goto retry;
+ }
-/**
- * make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of associated target process
- * @start: start of the region to mark for exclusive device access
- * @end: end address of region
- * @pages: returns the pages which were successfully marked for exclusive access
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
- *
- * Returns: number of pages found in the range by GUP. A page is marked for
- * exclusive access only if the page pointer is non-NULL.
- *
- * This function finds ptes mapping page(s) to the given address range, locks
- * them and replaces mappings with special swap entries preventing userspace CPU
- * access. On fault these entries are replaced with the original mapping after
- * calling MMU notifiers.
- *
- * A driver using this to program access from a device must use a mmu notifier
- * critical section to hold a device specific lock during programming. Once
- * programming is complete it should drop the page lock and reference after
- * which point CPU access to the page will revoke the exclusive access.
- */
-int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, struct page **pages,
- void *owner)
-{
- long npages = (end - start) >> PAGE_SHIFT;
- long i;
-
- npages = get_user_pages_remote(mm, start, npages,
- FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
- pages, NULL);
- if (npages < 0)
- return npages;
-
- for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- struct folio *folio = page_folio(pages[i]);
- if (PageTail(pages[i]) || !folio_trylock(folio)) {
- folio_put(folio);
- pages[i] = NULL;
- continue;
- }
+ /* Nuke the page table entry so we get the uptodate dirty bit. */
+ flush_cache_page(vma, addr, page_to_pfn(page));
+ fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
- if (!folio_make_device_exclusive(folio, mm, start, owner)) {
- folio_unlock(folio);
- folio_put(folio);
- pages[i] = NULL;
- }
- }
+ /* Set the dirty flag on the folio now the PTE is gone. */
+ if (pte_dirty(fw.pte))
+ folio_mark_dirty(folio);
- return npages;
+ /*
+ * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+ * do_swap_page() will trigger the conversion back while holding the
+ * folio lock.
+ */
+ entry = make_device_exclusive_entry(page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(fw.pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ /* The pte is writable, uffd-wp does not apply. */
+ set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+ folio_walk_end(&fw, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ *foliop = folio;
+ return page;
}
-EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif
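
A hedged driver-side sketch of the new single-page interface, mirroring the calling convention described in the kerneldoc above (names are illustrative; the driver's MMU notifier is assumed to filter MMU_NOTIFY_EXCLUSIVE events carrying @owner to avoid livelock; assumes <linux/rmap.h>, <linux/mmap_lock.h> and <linux/err.h>):

static int example_grab_exclusive(struct mm_struct *mm, unsigned long addr,
				  void *owner)
{
	struct folio *folio;
	struct page *page;

	mmap_read_lock(mm);
	page = make_device_exclusive(mm, addr, owner, &folio);
	if (IS_ERR(page)) {
		mmap_read_unlock(mm);
		return PTR_ERR(page);
	}

	/*
	 * Program the device mapping here, under a driver-specific lock
	 * that the driver's MMU notifier also takes.
	 */

	/* Dropping the folio lock and reference lets a later CPU fault
	 * revoke the exclusive access again. */
	folio_unlock(folio);
	folio_put(folio);
	mmap_read_unlock(mm);
	return 0;
}
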
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -2524,7 +2768,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(root);
}
-static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
@@ -2588,7 +2832,8 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start,
+ folio_nr_pages(folio));
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2606,35 +2851,37 @@ static void rmap_walk_anon(struct folio *folio,
anon_vma_unlock_read(anon_vma);
}
-/*
- * rmap_walk_file - do something to file page using the object-based rmap method
- * @folio: the folio to be handled
- * @rwc: control variable according to each walk type
- * @locked: caller holds relevant rmap lock
+/**
+ * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
+ * of a page mapped within a specified page cache object at a specified offset.
*
- * Find all the mappings of a folio using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
+ * @folio: Either the folio whose mappings to traverse, or if NULL,
+ * the callbacks specified in @rwc will be configured such
+ * as to be able to look up mappings correctly.
+ * @mapping: The page cache object whose mapping VMAs we intend to
+ * traverse. If @folio is non-NULL, this should be equal to
+ * folio_mapping(folio).
+ * @pgoff_start: The offset within @mapping of the page which we are
+ * looking up. If @folio is non-NULL, this should be equal
+ * to folio_pgoff(folio).
+ * @nr_pages: The number of pages mapped by the mapping. If @folio is
+ * non-NULL, this should be equal to folio_nr_pages(folio).
+ * @rwc: The reverse mapping walk control object describing how
+ * the traversal should proceed.
+ * @locked: Is the @mapping already locked? If not, we acquire the
+ * lock.
*/
-static void rmap_walk_file(struct folio *folio,
- struct rmap_walk_control *rwc, bool locked)
+static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
+ pgoff_t pgoff_start, unsigned long nr_pages,
+ struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = folio_mapping(folio);
- pgoff_t pgoff_start, pgoff_end;
+ pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
struct vm_area_struct *vma;
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_rwsem.
- */
- VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-
- if (!mapping)
- return;
+ VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
+ VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
+ VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);
- pgoff_start = folio_pgoff(folio);
- pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
@@ -2649,7 +2896,7 @@ static void rmap_walk_file(struct folio *folio,
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(&folio->page, vma);
+ unsigned long address = vma_address(vma, pgoff_start, nr_pages);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2662,12 +2909,38 @@ lookup:
if (rwc->done && rwc->done(folio))
goto done;
}
-
done:
if (!locked)
i_mmap_unlock_read(mapping);
}
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @folio: the folio to be handled
+ * @rwc: control variable according to each walk type
+ * @locked: caller holds relevant rmap lock
+ *
+ * Find all the mappings of a folio using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ */
+static void rmap_walk_file(struct folio *folio,
+ struct rmap_walk_control *rwc, bool locked)
+{
+ /*
+ * The folio lock not only makes sure that folio->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the structure
+ * at mapping cannot be freed and reused yet, so we can safely take
+ * mapping->i_mmap_rwsem.
+ */
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (!folio->mapping)
+ return;
+
+ __rmap_walk_file(folio, folio->mapping, folio->index,
+ folio_nr_pages(folio), rwc, locked);
+}
+
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
@@ -2702,6 +2975,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
atomic_inc(&folio->_entire_mapcount);
+ atomic_inc(&folio->_large_mapcount);
if (flags & RMAP_EXCLUSIVE)
SetPageAnonExclusive(&folio->page);
VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
@@ -2716,6 +2990,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio,
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_large_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
__folio_set_anon(folio, vma, address, true);
SetPageAnonExclusive(&folio->page);