diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 631 |
1 files changed, 429 insertions, 202 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 30de18c4fd28..bada249b9fb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -46,6 +46,8 @@ #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> +#include <linux/sched/mm.h> +#include <linux/sysctl.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -112,27 +114,17 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->lruvec->lru_lock (follow_page->mark_page_accessed) - * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) + * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) - * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ -static void mapping_set_update(struct xa_state *xas, - struct address_space *mapping) -{ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - xas_set_update(xas, workingset_update_node); - xas_set_lru(xas, &shadow_nodes); -} - static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { @@ -150,7 +142,7 @@ static void page_cache_delete(struct address_space *mapping, xas_init_marks(&xas); folio->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ + /* Leave folio->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } @@ -168,7 +160,7 @@ static void filemap_unaccount_folio(struct address_space *mapping, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { - int mapcount = page_mapcount(&folio->page); + int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* @@ -177,7 +169,7 @@ static void filemap_unaccount_folio(struct address_space *mapping, * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ - page_mapcount_reset(&folio->page); + atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } @@ -235,15 +227,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); - int refs = 1; free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); - if (folio_test_large(folio)) - refs = folio_nr_pages(folio); - folio_put_refs(folio, refs); + folio_put_refs(folio, folio_nr_pages(folio)); } /** @@ -450,6 +439,24 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, EXPORT_SYMBOL(filemap_fdatawrite_range); /** + * filemap_fdatawrite_range_kick - start writeback on a range + * @mapping: target address_space + * @start: index to start writeback on + * @end: last (inclusive) index for writeback + * + * This is a non-integrity writeback helper, to start writing back folios + * for the indicated range. + * + * Return: %0 on success, negative error code otherwise. + */ +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end) +{ + return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); +} +EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); + +/** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * @@ -530,7 +537,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); - folio_clear_error(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -851,24 +857,19 @@ EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { - XA_STATE(xas, &mapping->i_pages, index); - bool huge = folio_test_hugetlb(folio); - bool charged = false; - long nr = 1; + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); + bool huge; + long nr; + unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), + folio); mapping_set_update(&xas, mapping); - if (!huge) { - int error = mem_cgroup_charge(folio, NULL, gfp); - if (error) - return error; - charged = true; - } - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); - xas_set_order(&xas, index, folio_order(folio)); + huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; @@ -876,13 +877,10 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->mapping = mapping; folio->index = xas.xa_index; - do { - unsigned int order = xa_get_order(xas.xa, xas.xa_index); + for (;;) { + int order = -1; void *entry, *old = NULL; - if (order > folio_order(folio)) - xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), - order, gfp); xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; @@ -890,19 +888,37 @@ noinline int __filemap_add_folio(struct address_space *mapping, xas_set_err(&xas, -EEXIST); goto unlock; } + /* + * If a larger entry exists, + * it will be the first and only entry iterated. + */ + if (order == -1) + order = xas_get_order(&xas); } if (old) { - if (shadowp) - *shadowp = old; - /* entry may have been split before we acquired lock */ - order = xa_get_order(xas.xa, xas.xa_index); - if (order > folio_order(folio)) { + if (order > 0 && order > forder) { + unsigned int split_order = max(forder, + xas_try_split_min_order(order)); + /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); - xas_split(&xas, old, order); + + while (order > forder) { + xas_set_order(&xas, index, split_order); + xas_try_split(&xas, old, order); + if (xas_error(&xas)) + goto unlock; + order = split_order; + split_order = + max(xas_try_split_min_order( + split_order), + forder); + } xas_reset(&xas); } + if (shadowp) + *shadowp = old; } xas_store(&xas, folio); @@ -918,9 +934,13 @@ noinline int __filemap_add_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } + unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + + if (!xas_nomem(&xas, gfp)) + break; + } if (xas_error(&xas)) goto error; @@ -928,10 +948,8 @@ unlock: trace_mm_filemap_add_to_page_cache(folio); return 0; error: - if (charged) - mem_cgroup_uncharge(folio); folio->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ + /* Leave folio->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } @@ -943,11 +961,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, void *shadow = NULL; int ret; + ret = mem_cgroup_charge(folio, NULL, gfp); + if (ret) + return ret; + __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); - if (unlikely(ret)) + if (unlikely(ret)) { + mem_cgroup_uncharge(folio); __folio_clear_locked(folio); - else { + } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like @@ -966,7 +989,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; @@ -976,14 +999,14 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - folio = __folio_alloc_node(gfp, order, n); + folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } - return folio_alloc(gfp, order); + return folio_alloc_noprof(gfp, order); } -EXPORT_SYMBOL(filemap_alloc_folio); +EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* @@ -1043,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? */ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1051,6 +1087,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1198,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { @@ -1344,7 +1378,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. @@ -1448,25 +1482,6 @@ static int folio_put_wait_locked(struct folio *folio, int state) } /** - * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue - * @folio: Folio defining the wait queue of interest - * @waiter: Waiter to add to the queue - * - * Add an arbitrary @waiter to the wait queue for the nominated @folio. - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) -{ - wait_queue_head_t *q = folio_waitqueue(folio); - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, waiter); - folio_set_waiters(folio); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(folio_add_wait_queue); - -/** * folio_unlock - Unlock a locked folio. * @folio: The folio. * @@ -1507,7 +1522,7 @@ void folio_end_read(struct folio *folio, bool success) /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; @@ -1540,7 +1555,7 @@ EXPORT_SYMBOL(folio_end_private_2); * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { @@ -1553,8 +1568,8 @@ EXPORT_SYMBOL(folio_wait_private_2); * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * - * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a - * fatal signal is received by the calling task. + * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is + * received by the calling task. * * Return: * - 0 if successful. @@ -1574,6 +1589,43 @@ int folio_wait_private_2_killable(struct folio *folio) } EXPORT_SYMBOL(folio_wait_private_2_killable); +static void filemap_end_dropbehind(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (!folio_test_clear_dropbehind(folio)) + return; + if (mapping) + folio_unmap_invalidate(mapping, folio, 0); +} + +/* + * If folio was marked as dropbehind, then pages should be dropped when writeback + * completes. Do that now. If we fail, it's likely because of a big folio - + * just reset dropbehind for that case and latter completions should invalidate. + */ +static void filemap_end_dropbehind_write(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + + /* + * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, + * but can happen if normal writeback just happens to find dirty folios + * that were created as part of uncached writeback, and that writeback + * would otherwise not need non-IRQ handling. Just skip the + * invalidation in that case. + */ + if (in_task() && folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} + /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. @@ -1607,6 +1659,8 @@ void folio_end_writeback(struct folio *folio) folio_get(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); + + filemap_end_dropbehind_write(folio); acct_reclaim_writeback(folio); folio_put(folio); } @@ -1728,12 +1782,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - break; + return xas.xa_index; if (xas.xa_index == 0) - break; + return 0; } - return xas.xa_index; + return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1786,7 +1840,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily - * increased by a speculative page cache (or fast GUP) lookup as it can + * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by @@ -1823,7 +1877,7 @@ repeat: if (!folio || xa_is_value(folio)) goto out; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { @@ -1895,8 +1949,10 @@ repeat: folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { - unsigned order = FGF_GET_ORDER(fgp_flags); + unsigned int min_order = mapping_min_folio_order(mapping); + unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; + index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; @@ -1909,10 +1965,8 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; - if (!mapping_large_folio_support(mapping)) - order = 0; - if (order > MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + if (order > mapping_max_folio_order(mapping)) + order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); @@ -1921,7 +1975,7 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order > 0) + if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) @@ -1930,18 +1984,31 @@ no_page: /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); + if (fgp_flags & FGP_DONTCACHE) + __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; - } while (order-- > 0); + } while (order-- > min_order); if (err == -EEXIST) goto repeat; - if (err) + if (err) { + /* + * When NOWAIT I/O fails to allocate folios this could + * be due to a nonblocking memory allocation and not + * because the system actually is out of memory. + * Return -EAGAIN so that there caller retries in a + * blocking fashion instead of propagating -ENOMEM + * to the application. + */ + if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) + err = -EAGAIN; return ERR_PTR(err); + } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. @@ -1952,6 +2019,9 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); + /* not an uncached lookup, clear uncached if set */ + if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) + folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); @@ -1977,7 +2047,7 @@ retry: if (!folio || xa_is_value(folio)) return folio; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { @@ -2023,17 +2093,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, if (!folio_batch_add(fbatch, folio)) break; } - rcu_read_unlock(); if (folio_batch_count(fbatch)) { - unsigned long nr = 1; + unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); - *start = indices[idx] + nr; + else + nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); + *start = round_down(indices[idx] + nr, nr); } + rcu_read_unlock(); + return folio_batch_count(fbatch); } @@ -2065,10 +2138,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { + unsigned long base; + unsigned long nr; + if (!xa_is_value(folio)) { - if (folio->index < *start) + nr = folio_nr_pages(folio); + base = folio->index; + /* Omit large folio which begins before the start */ + if (base < *start) goto put; - if (folio_next_index(folio) - 1 > end) + /* Omit large folio which extends beyond the end */ + if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2077,7 +2157,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); + } else { + nr = 1 << xas_get_order(&xas); + base = xas.xa_index & ~(nr - 1); + /* Omit order>0 value which begins before the start */ + if (base < *start) + continue; + /* Omit order>0 value which extends beyond the end */ + if (base + nr - 1 > end) + break; } + + /* Update start now so that last update is correct on return */ + *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; @@ -2089,15 +2181,6 @@ put: } rcu_read_unlock(); - if (folio_batch_count(fbatch)) { - unsigned long nr = 1; - int idx = folio_batch_count(fbatch) - 1; - - folio = fbatch->folios[idx]; - if (!xa_is_value(folio)) - nr = folio_nr_pages(folio); - *start = indices[idx] + nr; - } return folio_batch_count(fbatch); } @@ -2157,7 +2240,11 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (xa_is_value(folio)) goto update_start; - if (!folio_try_get_rcu(folio)) + /* If we landed in the middle of a THP, continue at its end. */ + if (xa_is_sibling(folio)) + goto update_start; + + if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) @@ -2168,6 +2255,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, *start = folio->index + nr; goto out; } + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); @@ -2289,7 +2377,7 @@ static void filemap_get_read_batch(struct address_space *mapping, break; if (xa_is_sibling(folio)) break; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) @@ -2318,13 +2406,6 @@ static int filemap_read_folio(struct file *file, filler_t filler, unsigned long pflags; int error; - /* - * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if read_folio - * fails. - */ - folio_clear_error(folio); - /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); @@ -2424,16 +2505,22 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, pgoff_t index, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; + unsigned int min_order = mapping_min_folio_order(mapping); + pgoff_t index; + + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; + if (iocb->ki_flags & IOCB_DONTCACHE) + __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock @@ -2449,6 +2536,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2456,7 +2544,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2477,6 +2566,8 @@ static int filemap_readahead(struct kiocb *iocb, struct file *file, if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } @@ -2486,10 +2577,10 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; + unsigned int flags; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ @@ -2500,17 +2591,21 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + flags = memalloc_noio_save(); + if (iocb->ki_flags & IOCB_DONTCACHE) + ractl.dropbehind = 1; + page_cache_sync_ra(&ractl, last_index - index); + if (iocb->ki_flags & IOCB_NOWAIT) + memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -2532,6 +2627,7 @@ retry: goto err; } + trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) @@ -2550,6 +2646,18 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) return (pos1 >> shift == pos2 >> shift); } +static void filemap_end_dropbehind_read(struct folio *folio) +{ + if (!folio_test_dropbehind(folio)) + return; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return; + if (folio_trylock(folio)) { + filemap_end_dropbehind(folio); + folio_unlock(folio); + } +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2576,12 +2684,14 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { @@ -2661,8 +2771,12 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) - folio_put(fbatch.folios[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + filemap_end_dropbehind_read(folio); + folio_put(folio); + } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); @@ -2688,14 +2802,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); -int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - loff_t pos = iocb->ki_pos; - loff_t end = pos + count - 1; int ret; - if (iocb->ki_flags & IOCB_NOWAIT) { + if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; @@ -2714,6 +2826,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + + return filemap_invalidate_pages(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1, + iocb->ki_flags & IOCB_NOWAIT); +} EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** @@ -2796,8 +2917,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2854,7 +2974,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -2914,7 +3034,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } @@ -2953,7 +3073,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas, if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; - start = (start + bsz) & ~(bsz - 1); + start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: @@ -2965,7 +3085,7 @@ unlock: static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) - return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index); + return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } @@ -3100,7 +3220,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ - if (vm_flags & VM_HUGEPAGE) { + if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; @@ -3207,7 +3327,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; - ptep = pte_offset_map(vmf->pmd, vmf->address); + ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; @@ -3262,6 +3383,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; + trace_mm_filemap_fault(mapping, index); + /* * Do we have something in the page cache already? */ @@ -3419,7 +3542,7 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); - vm_fault_t ret = do_set_pmd(vmf, page); + vm_fault_t ret = do_set_pmd(vmf, folio, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); @@ -3446,10 +3569,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas, continue; if (xa_is_value(folio)) continue; - if (folio_test_locked(folio)) - continue; - if (!folio_try_get_rcu(folio)) + if (!folio_try_get(folio)) continue; + if (folio_test_locked(folio)) + goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; @@ -3481,7 +3604,7 @@ skip: static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); @@ -3492,7 +3615,15 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - (*mmap_miss)++; + /* + * If there are too many folios that are recently evicted + * in a file, they will probably continue to be evicted. + * In such situation, read-ahead is only a waste of IO. + * Don't decrease mmap_miss in this scenario to make sure + * we can stop read-ahead. + */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3507,6 +3638,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3521,6 +3653,7 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); + *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; @@ -3533,7 +3666,7 @@ skip: static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, - unsigned int *mmap_miss) + unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; @@ -3541,7 +3674,9 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, if (PageHWPoison(page)) return ret; - (*mmap_miss)++; + /* See comment of filemap_map_folio_range() */ + if (!folio_test_workingset(folio)) + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3555,6 +3690,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); + (*rss)++; folio_ref_inc(folio); return ret; @@ -3566,12 +3702,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - pgoff_t last_pgoff = start_pgoff; + pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; - unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved; + unsigned long rss = 0; + unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3590,6 +3727,12 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, folio_put(folio); goto out; } + + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + if (end_pgoff > file_end) + end_pgoff = file_end; + + folio_type = mm_counter_file(folio); do { unsigned long end; @@ -3601,16 +3744,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, - folio, addr, &mmap_miss); + folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, - nr_pages, &mmap_miss); + nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); + add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); + trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); @@ -3708,9 +3853,11 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, 0); + folio = filemap_alloc_folio(gfp, + mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); + index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); @@ -3940,31 +4087,22 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; + size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ + struct folio *folio; + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; - offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); - -again: - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - */ - if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { - status = -EFAULT; - break; - } + bytes = iov_iter_count(i); +retry: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, bytes); + balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; @@ -3972,18 +4110,28 @@ again: } status = a_ops->write_begin(file, mapping, pos, bytes, - &page, &fsdata); + &folio, &fsdata); if (unlikely(status < 0)) break; + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - copied = copy_page_from_iter_atomic(page, offset, bytes, i); - flush_dcache_page(page); + /* + * Faults here on mmap()s can recurse into arbitrary + * filesystem code. Lots of locks are held that can + * deadlock. Use an atomic copy to avoid deadlocking + * in page fault handling. + */ + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); + flush_dcache_folio(folio); status = a_ops->write_end(file, mapping, pos, bytes, copied, - page, fsdata); + folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) @@ -3998,14 +4146,26 @@ again: * halfway through, might be a race with munmap, * might be severe memory pressure. */ - if (copied) + if (chunk > PAGE_SIZE) + chunk /= 2; + if (copied) { bytes = copied; - goto again; - } - pos += status; - written += status; + goto retry; + } - balance_dirty_pages_ratelimited(mapping); + /* + * 'folio' is now unlocked and faults on it can be + * handled. Ensure forward progress by trying to + * fault it in now. + */ + if (fault_in_iov_iter_readable(i, bytes) == bytes) { + status = -EFAULT; + break; + } + } else { + pos += status; + written += status; + } } while (iov_iter_count(i)); if (!written) @@ -4134,6 +4294,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) } EXPORT_SYMBOL(filemap_release_folio); +/** + * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache + * @inode: The inode to flush + * @flush: Set to write back rather than simply invalidate. + * @start: First byte to in range. + * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start + * onwards. + * + * Invalidate all the folios on an inode that contribute to the specified + * range, possibly writing them back first. Whilst the operation is + * undertaken, the invalidate lock is held to prevent new folios from being + * installed. + */ +int filemap_invalidate_inode(struct inode *inode, bool flush, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t first = start >> PAGE_SHIFT; + pgoff_t last = end >> PAGE_SHIFT; + pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; + + if (!mapping || !mapping->nrpages || end < start) + goto out; + + /* Prevent new folios from being added to the inode. */ + filemap_invalidate_lock(mapping); + + if (!mapping->nrpages) + goto unlock; + + unmap_mapping_pages(mapping, first, nr, false); + + /* Write back the data if we're asked to. */ + if (flush) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; + + filemap_fdatawrite_wbc(mapping, &wbc); + } + + /* Wait for writeback to complete on all folios and discard. */ + invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); + +unlock: + filemap_invalidate_unlock(mapping); +out: + return filemap_check_errors(mapping); +} +EXPORT_SYMBOL_GPL(filemap_invalidate_inode); + #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping @@ -4153,6 +4367,9 @@ static void filemap_cachestat(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; + /* Flush stats (and potentially sleep) outside the RCU read section. */ + mem_cgroup_flush_stats_ratelimited(NULL); + rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; @@ -4173,7 +4390,7 @@ static void filemap_cachestat(struct address_space *mapping, if (xas_retry(&xas, folio)) continue; - order = xa_get_order(xas.xa, xas.xa_index); + order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; @@ -4216,7 +4433,7 @@ static void filemap_cachestat(struct address_space *mapping, goto resched; } #endif - if (workingset_test_recent(shadow, true, &workingset)) + if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; @@ -4241,6 +4458,20 @@ resched: } /* + * See mincore: reveal pagecache information only for files + * that the calling process has write access to, or could (if + * tried) open for writing. + */ +static inline bool can_do_cachestat(struct file *f) +{ + if (f->f_mode & FMODE_WRITE) + return true; + if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) + return true; + return file_permission(f, MAY_WRITE) == 0; +} + +/* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the @@ -4278,39 +4509,35 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; - if (!f.file) + if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, - sizeof(struct cachestat_range))) { - fdput(f); + sizeof(struct cachestat_range))) return -EFAULT; - } /* hugetlbfs is not supported */ - if (is_file_hugepages(f.file)) { - fdput(f); + if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; - } - if (flags != 0) { - fdput(f); + if (!can_do_cachestat(fd_file(f))) + return -EPERM; + + if (flags != 0) return -EINVAL; - } first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); - mapping = f.file->f_mapping; + mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); - fdput(f); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; |