summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- mm/filemap.c | 121
1 files changed, 83 insertions, 38 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db..7437b2bd75c1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,15 @@
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
+/*
+ * Install the workingset_update_node callback and the shadow_nodes list_lru
+ * on @xas, unless @mapping is a DAX or shmem mapping, in which case @xas is
+ * left untouched.
+ */
+static void mapping_set_update(struct xa_state *xas,
+		struct address_space *mapping)
+{
+	if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {
+		xas_set_update(xas, workingset_update_node);
+		xas_set_lru(xas, &shadow_nodes);
+	}
+}
+
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
@@ -843,7 +852,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, index);
- int huge = folio_test_hugetlb(folio);
+ bool huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
@@ -1354,7 +1363,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
unsigned long pflags;
bool in_thrashing;
wait_queue_head_t *q;
- struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+ struct folio *folio = pfn_swap_entry_folio(entry);
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
@@ -1912,8 +1921,6 @@ no_page:
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order == 1)
- order = 0;
if (order > 0)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
@@ -2609,15 +2616,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
- * Pairs with a barrier in
- * block_write_end()->mark_buffer_dirty() or other page
- * dirtying routines like iomap_write_end() to ensure
- * changes to page contents are visible before we see
- * increased inode size.
- */
- smp_rmb();
-
- /*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
@@ -3183,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
return fpin;
}
+/*
+ * For VM_LOCKED VMAs whose fault carries a valid original PTE, recheck
+ * whether the faulting PTE is still none before the caller goes down the
+ * "no page in the page cache" path.
+ *
+ * Returns VM_FAULT_NOPAGE if the page table could not be mapped or the PTE
+ * is no longer none, and 0 if the caller should carry on.
+ */
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+	pte_t *ptep;
+
+	/*
+	 * We might have COW'ed a pagecache folio and might now have an mlocked
+	 * anon folio mapped. The original pagecache folio is not mlocked and
+	 * might have been evicted. During a read+clear/modify/write update of
+	 * the PTE, such as done in do_numa_page()/change_pte_range(), we
+	 * temporarily clear the PTE under PT lock and might detect it here as
+	 * "none" when not holding the PT lock.
+	 *
+	 * Not rechecking the PTE under PT lock could result in an unexpected
+	 * major fault in an mlock'ed region. Recheck only for this special
+	 * scenario while holding the PT lock, to not degrade non-mlocked
+	 * scenarios. Recheck the PTE without the PT lock first, to reduce
+	 * the number of times we take the PT lock.
+	 */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return 0;
+
+	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+		return 0;
+
+	/*
+	 * Fetch the PT lock pointer together with the PTE mapping. Nothing in
+	 * this function otherwise sets vmf->ptl, so a value left there by an
+	 * earlier stage of the fault need not be the lock that protects the
+	 * page table we map here.
+	 */
+	ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address,
+				     &vmf->ptl);
+	if (unlikely(!ptep))
+		return VM_FAULT_NOPAGE;
+
+	if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+		ret = VM_FAULT_NOPAGE;
+	} else {
+		/* Lockless read saw "none": confirm it under the PT lock. */
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_none(ptep_get(ptep))))
+			ret = VM_FAULT_NOPAGE;
+		spin_unlock(vmf->ptl);
+	}
+	pte_unmap(ptep);
+	return ret;
+}
+
/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
@@ -3238,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
mapping_locked = true;
}
} else {
+ ret = filemap_fault_recheck_pte_none(vmf);
+ if (unlikely(ret))
+ return ret;
+
/* No page in the page cache at all */
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
@@ -4111,28 +4155,40 @@ static void filemap_cachestat(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
+ int order;
unsigned long nr_pages;
pgoff_t folio_first_index, folio_last_index;
+ /*
+ * Don't deref the folio. It is not pinned, and might
+ * get freed (and reused) underneath us.
+ *
+ * We *could* pin it, but that would be expensive for
+ * what should be a fast and lightweight syscall.
+ *
+ * Instead, derive all information of interest from
+ * the rcu-protected xarray.
+ */
+
if (xas_retry(&xas, folio))
continue;
+ order = xa_get_order(xas.xa, xas.xa_index);
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
if (xa_is_value(folio)) {
/* page is evicted */
void *shadow = (void *)folio;
bool workingset; /* not used */
- int order = xa_get_order(xas.xa, xas.xa_index);
-
- nr_pages = 1 << order;
- folio_first_index = round_down(xas.xa_index, 1 << order);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
cs->nr_evicted += nr_pages;
@@ -4150,24 +4206,13 @@ static void filemap_cachestat(struct address_space *mapping,
goto resched;
}
- nr_pages = folio_nr_pages(folio);
- folio_first_index = folio_pgoff(folio);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
-
/* page is in cache */
cs->nr_cache += nr_pages;
- if (folio_test_dirty(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cs->nr_dirty += nr_pages;
- if (folio_test_writeback(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cs->nr_writeback += nr_pages;
resched: